In [1]:
import pandas as pd
import sqlite3
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

conn = sqlite3.connect('cleaned_customer_support.db')
data = pd.read_sql_query("SELECT * FROM cleaned_customer_support", conn)
conn.close()

# Identify non-numerical columns
non_numerical_cols = data.select_dtypes(include=['object']).columns
original_to_encoded_cols = {}
for col in non_numerical_cols:
    encoded_cols = [f"{col}_{value}" for value in data[col].unique()]
    original_to_encoded_cols[col] = encoded_cols
# Apply one-hot encoding to non-numerical columns
data = pd.get_dummies(data, columns=non_numerical_cols)
X = data.drop('CSAT_Score', axis=1)
y = data['CSAT_Score'].astype(int)

# Split the data into smaller batches
batch_size = 10000
X_batches = [X[i:i+batch_size] for i in range(0, len(X), batch_size)]
y_batches = [y[i:i+batch_size] for i in range(0, len(y), batch_size)]

# Original CSAT distribution
original_counts = {
    1: 5348,
    2: 394,   
    3: 730,
    4: 2605,
    5: 19679  
}

csat_5_target = 8000
csat_2_target = csat_5_target / 2
multiplier = csat_2_target / original_counts[2]
desired_ratio = {score: int(count * multiplier) for score, count in original_counts.items() if score != 5}
desired_ratio[5] = csat_5_target  

over = SMOTE(sampling_strategy=desired_ratio)
under = RandomUnderSampler(sampling_strategy={5: csat_5_target}) 
resample_pipeline = Pipeline(steps=[('o', over), ('u', under)])

# Adding the batches one by one 
resampled_batches = []
for X_batch, y_batch in zip(X_batches, y_batches):
    X_resampled, y_resampled = resample_pipeline.fit_resample(X_batch, y_batch)
    resampled_batches.append((X_resampled, y_resampled))


balanced_data = pd.concat([pd.DataFrame(X_resampled, columns=X.columns) for X_resampled, _ in resampled_batches])
balanced_data['CSAT_Score'] = pd.concat([pd.Series(y_resampled) for _, y_resampled in resampled_batches], ignore_index=True)
#Reverting to original values
reverted_categorical_cols = pd.DataFrame()
for col in non_numerical_cols:
    encoded_cols = [c for c in balanced_data.columns if c.startswith(col + '_')]
    reverted_col = balanced_data[encoded_cols].idxmax(axis=1).apply(lambda x: x.rsplit('_', 1)[-1])
    reverted_categorical_cols[col] = reverted_col

balanced_data = balanced_data.drop(columns=sum(original_to_encoded_cols.values(), []))
balanced_data = pd.concat([balanced_data, reverted_categorical_cols], axis=1)
conn = sqlite3.connect('balance_customer_support.db')
balanced_data.to_sql('balanced_customer_support', conn, if_exists='replace', index=False)
conn.close()

print("Balanced dataset saved to 'balance_customer_support.db'.")


MemoryError: Unable to allocate 34.1 GiB for an array with shape (53267, 85798) and data type float64