In [5]:
import pandas as pd
import sqlite3
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Load the dataset from SQLite database
conn = sqlite3.connect('cleaned_customer_support.db')
data = pd.read_sql_query("SELECT * FROM cleaned_customer_support", conn)
conn.close()

# Identify non-numerical columns
non_numerical_cols = data.select_dtypes(include=['object']).columns

# Apply one-hot encoding to non-numerical columns
data = pd.get_dummies(data, columns=non_numerical_cols)

# Separate your features and target variable
X = data.drop('CSAT_Score', axis=1)
y = data['CSAT_Score'].astype(int)  # Ensuring that CSAT_Score is of integer type

# Define the resampling strategies
desired_ratio = {
    1: 10000,  
    2: 7000,   
    3: 9000,   
    4: 13000   
}
over = SMOTE(sampling_strategy=desired_ratio)
under = RandomUnderSampler(sampling_strategy={5: 20000})  # You might still want to reduce the majority class a bit

# Combine the resampling strategies into a pipeline
resample_pipeline = Pipeline(steps=[('o', over), ('u', under)])

# Apply the pipeline to resample the dataset
X_resampled, y_resampled = resample_pipeline.fit_resample(X, y)

# Combine the resampled features and target into a new DataFrame
balanced_data = pd.DataFrame(X_resampled, columns=X.columns)
balanced_data['CSAT_Score'] = y_resampled

# Save the balanced dataset to a new SQLite database
conn = sqlite3.connect('balance_customer_support.db')
balanced_data.to_sql('balanced_customer_support', conn, if_exists='replace', index=False)
conn.close()

print("Balanced dataset saved to 'balance_customer_support.db'.")



MemoryError: Unable to allocate 18.4 GiB for an array with shape (85798, 28742) and data type float64