In [22]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report
from sklearn.model_selection import cross_val_score , StratifiedKFold
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import joblib

In [2]:
data = pd.read_pickle('..\Data\After_ExtractingFeatures.pkl')

In [3]:
data.head()

Unnamed: 0,feedback,review_len,variation_0,variation_1,variation_2,variation_3,variation_4,0,1,2,...,3629,3630,3631,3632,3633,3634,3635,3636,3637,3638
0,1,13,0,0,0,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,9,0,0,0,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,195,0,0,0,1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,172,0,0,0,0,1,0.0,0.0,0.0,...,0.352405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,5,0,0,0,0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
y = data['feedback']
X = data.drop('feedback', axis=1)

## Splitting Data into Training, Validation and Test sets

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 , random_state=42 , stratify=y)

In [6]:
X_train.shape[0] , X_test.shape[0]

(1948, 487)

In [7]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)



In [8]:
X_train_resampled.shape[0] , y_train_resampled.shape[0]

(3542, 3542)

In [9]:
model = LogisticRegression()
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=cv, scoring='accuracy')
model.fit(X_train_resampled, y_train_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [10]:
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

Cross-Validation Scores: [0.90985915 0.90985915 0.91525424 0.88700565 0.9039548  0.88983051
 0.90677966 0.89548023 0.93220339 0.92090395]
Mean Accuracy: 0.9071130739237686


In [11]:
y_pred = model.predict(X_test)



In [12]:
accuracy_score(y_test, y_pred)

0.8685831622176592

In [13]:
confusion_matrix(y_test, y_pred)

array([[ 25,  19],
       [ 45, 398]], dtype=int64)

In [14]:
classification_report(y_test, y_pred).split('\n')

['              precision    recall  f1-score   support',
 '',
 '           0       0.36      0.57      0.44        44',
 '           1       0.95      0.90      0.93       443',
 '',
 '    accuracy                           0.87       487',
 '   macro avg       0.66      0.73      0.68       487',
 'weighted avg       0.90      0.87      0.88       487',
 '']

### Applying XGBoost Model

In [15]:
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

In [16]:
xgb_model = XGBClassifier()
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

In [17]:
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [18]:
y_pred_XGboost = best_model.predict(X_test)

In [19]:
accuracy_score(y_test, y_pred_XGboost)

0.9014373716632443

In [20]:
confusion_matrix(y_test, y_pred_XGboost)

array([[ 12,  32],
       [ 16, 427]], dtype=int64)

In [21]:
classification_report(y_test, y_pred_XGboost).split('\n')

['              precision    recall  f1-score   support',
 '',
 '           0       0.43      0.27      0.33        44',
 '           1       0.93      0.96      0.95       443',
 '',
 '    accuracy                           0.90       487',
 '   macro avg       0.68      0.62      0.64       487',
 'weighted avg       0.88      0.90      0.89       487',
 '']

## Saving the models

In [23]:
model_name = 'xgboost_model.joblib'
joblib.dump(best_model, model_name)


['xgboost_model.joblib']

In [24]:
model_name = 'logisticRegression_model.joblib'
joblib.dump(model, model_name)

['logisticRegression_model.joblib']