In [6]:
import pandas as pd
import sqlite3,gc
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from textblob import TextBlob

# Function to calculate sentiment score
def calculate_sentiment(text):
    try:
        return TextBlob(str(text)).sentiment.polarity
    except:
        return None
    
conn = sqlite3.connect('cleaned_customer_support.db')
data = pd.read_sql_query("SELECT * FROM cleaned_customer_support", conn)
conn.close()

# Drop sentiment_score before resampling, as it doesn't make sense for synthetic data
if 'sentiment_score' in data.columns:
    data = data.drop(columns=['sentiment_score'])

# Identify non-numerical columns
non_numerical_cols = data.select_dtypes(include=['object']).columns
original_to_encoded_cols = {}
for col in non_numerical_cols:
    encoded_cols = [f"{col}_{value}" for value in data[col].unique()]
    original_to_encoded_cols[col] = encoded_cols
# Apply one-hot encoding to non-numerical columns
data = pd.get_dummies(data, columns=non_numerical_cols)
X = data.drop('CSAT_Score', axis=1)
y = data['CSAT_Score'].astype(int)

# Split the data into smaller batches
batch_size = 10000
X_batches = [X[i:i+batch_size] for i in range(0, len(X), batch_size)]
y_batches = [y[i:i+batch_size] for i in range(0, len(y), batch_size)]
print(f"Num of batches:{len(X_batches)}")
# Define the resampling strategies
original_counts = {
    1: 5348,
    2: 394,   
    3: 730,
    4: 2605,
    5: 19679  
}
desired_ratio = {
    2: int(original_counts[2] / len(X_batches) * 2.5),   
    3: int(original_counts[3] / len(X_batches) * 2.3),   
    4: int(original_counts[4] / len(X_batches) * 2)
}
undersampling_ratio = {
  1: int(original_counts[1] / len(X_batches) * 0.75), 
  5: int(original_counts[5] / len(X_batches)*0.65)
}

over = SMOTE(sampling_strategy=desired_ratio,random_state=42)
under = RandomUnderSampler(sampling_strategy=undersampling_ratio,random_state=42)  
resample_pipeline = Pipeline(steps=[('o', over), ('u', under)])

print("Desired Ratios for Oversampling:")
for class_label, ratio in desired_ratio.items():
    print(f"Class {class_label}: {ratio}")
print("Desired Ratios for Undersampling:")
for class_label, ratio in undersampling_ratio.items():
    print(f"Class {class_label}: {ratio}")
# Adding the batches one by one 
batch_num = 0
resampled_batches = []
for X_batch, y_batch in zip(X_batches, y_batches):
    batch_num+=1
    print("Batch Number:",batch_num)
    X_resampled, y_resampled = resample_pipeline.fit_resample(X_batch, y_batch)
    resampled_batches.append((X_resampled, y_resampled))
    del X_resampled , y_resampled
    gc.collect()


balanced_data = pd.concat([pd.DataFrame(X_resampled, columns=X.columns) for X_resampled, _ in resampled_batches])
balanced_data['CSAT_Score'] = pd.concat([pd.Series(y_resampled) for _, y_resampled in resampled_batches], ignore_index=True)
#Reverting to original values
reverted_categorical_cols = pd.DataFrame()
for col in non_numerical_cols:
    encoded_cols = [c for c in balanced_data.columns if c.startswith(col + '_')]
    reverted_col = balanced_data[encoded_cols].idxmax(axis=1).apply(lambda x: x.rsplit('_', 1)[-1])
    reverted_categorical_cols[col] = reverted_col

balanced_data = balanced_data.drop(columns=sum(original_to_encoded_cols.values(), []))
balanced_data = pd.concat([balanced_data, reverted_categorical_cols], axis=1)
balanced_data['sentiment_score'] = balanced_data['Customer_Remarks'].apply(calculate_sentiment)
conn = sqlite3.connect('balance_customer_support.db')
balanced_data.to_sql('balanced_customer_support', conn, if_exists='replace', index=False)
conn.close()

print("Balanced dataset saved to 'balance_customer_support.db'.")


Num of batches:3
Desired Ratios:
Class 2: 328
Class 3: 511
Class 4: 1389
Desired Ratios:
Class 1: 1337
Class 5: 4263
Batch Number: 1
Batch Number: 2


In [1]:
import sqlite3
import pandas as pd
conn = sqlite3.connect('balance_customer_support.db')
query = "SELECT CSAT_Score, COUNT(*) as count FROM balanced_customer_support GROUP BY CSAT_Score"
csat_distribution = pd.read_sql_query(query, conn)
conn.close()
print(csat_distribution)

   CSAT_Score  count
0           1   5885
1           2   1487
2           3   2169
3           4   4779
4           5   9164
