Reading the Cleaned Dataset 

In [111]:
#importing necessary libraries
import pandas as pd
import numpy as np

In [112]:
df = pd.read_csv('preprocessing_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,grew b 1965 watching loving thunderbirds mates...,0
1,1,put movie dvd player sat coke chips expectatio...,0
2,2,people know particular time past like feel nee...,0
3,3,even though great interest biblical movies bor...,0
4,4,im die hard dads army fan nothing ever change ...,1


In [113]:
df = df.drop(['Unnamed: 0'],axis=1)
df.head()

Unnamed: 0,text,label
0,grew b 1965 watching loving thunderbirds mates...,0
1,put movie dvd player sat coke chips expectatio...,0
2,people know particular time past like feel nee...,0
3,even though great interest biblical movies bor...,0
4,im die hard dads army fan nothing ever change ...,1


In [114]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [115]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [116]:
df['text']=df['text'].apply(lambda x:lemmatize_words(x))

In [117]:
df.head()

Unnamed: 0,text,label
0,grew b 1965 watching loving thunderbird mate s...,0
1,put movie dvd player sat coke chip expectation...,0
2,people know particular time past like feel nee...,0
3,even though great interest biblical movie bore...,0
4,im die hard dad army fan nothing ever change g...,1


Train-Test Split

In [118]:
# Define features (X) and target variable (y)
X = df.drop('label', axis=1)
y = df['label']

In [119]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

Using word2vec

In [120]:

import nltk
nltk.download('punkt_tab')

from nltk import sent_tokenize
from gensim.utils import simple_preprocess



[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\allen.harry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [121]:
from nltk.tokenize import word_tokenize

# Tokenize text data
X_train_tokenized = X_train.apply(lambda x: word_tokenize(str(x)))
X_test_tokenized = X_test.apply(lambda x: word_tokenize(str(x)))

In [122]:
X_train_tokenized.shape,y_train.shape

((32000,), (32000,))

In [123]:
from gensim.models import Word2Vec
# Create Word2Vec model
w2v_model = Word2Vec([row for row in X_train_tokenized.values], vector_size=100, window=5, min_count=1)

In [124]:
# Define Avg Word2Vec function
def avg_word2vec(sentence):
    vec = []
    for word in sentence:
        if word in w2v_model.wv:
            vec.append(w2v_model.wv[word])
    if vec:  # Check if vec is not empty
        return np.mean(np.array(vec), axis=0)
    else:
        return np.zeros(100)  # Return zero vector if no words in sentence

In [125]:
# Vectorize text data using Avg Word2Vec
X_train_avg_w2v = np.array([avg_word2vec(row) for row in X_train_tokenized.values])
X_test_avg_w2v = np.array([avg_word2vec(row) for row in X_test_tokenized.values])

In [126]:
X_train_avg_w2v.shape,X_test_avg_w2v.shape

((32000, 100), (8000, 100))

In [127]:
print(X_train_avg_w2v.shape, y_train.shape)

(32000, 100) (32000,)


In [128]:
print(X_test_avg_w2v.shape, y_test.shape)

(8000, 100) (8000,)


In [142]:
import pickle

# Save Word2Vec model
with open('w2v_model.pkl', 'wb') as f:
    pickle.dump(w2v_model, f)

Model Training

In [129]:
#Importing all the algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve 

In [130]:
models={
    "Logisitic Regression":LogisticRegression(),
    "Decision Tree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boost":GradientBoostingClassifier(),
    "Adaboost":AdaBoostClassifier(),
    "Xgboost":XGBClassifier()
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train_avg_w2v, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train_avg_w2v)
    y_test_pred = model.predict(X_test_avg_w2v)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))

    
    print('='*35)
    print('\n')

Logisitic Regression
Model performance for Training set
- Accuracy: 0.8512
- F1 score: 0.8512
- Precision: 0.8459
- Recall: 0.8575
- Roc Auc Score: 0.8512
----------------------------------
Model performance for Test set
- Accuracy: 0.8474
- F1 score: 0.8473
- Precision: 0.8406
- Recall: 0.8604
- Roc Auc Score: 0.8473


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.7145
- F1 score: 0.7145
- Precision: 0.7160
- Recall: 0.7189
- Roc Auc Score: 0.7145


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.8164
- F1 score: 0.8163
- Precision: 0.8115
- Recall: 0.8282
- Roc Auc Score: 0.8163


Gradient Boost
Model performance for Training se



Adaboost
Model performance for Training set
- Accuracy: 0.8170
- F1 score: 0.8170
- Precision: 0.8156
- Recall: 0.8177
- Roc Auc Score: 0.8170
----------------------------------
Model performance for Test set
- Accuracy: 0.8059
- F1 score: 0.8059
- Precision: 0.8090
- Recall: 0.8052
- Roc Auc Score: 0.8059


Xgboost
Model performance for Training set
- Accuracy: 0.9745
- F1 score: 0.9745
- Precision: 0.9749
- Recall: 0.9740
- Roc Auc Score: 0.9745
----------------------------------
Model performance for Test set
- Accuracy: 0.8380
- F1 score: 0.8380
- Precision: 0.8337
- Recall: 0.8478
- Roc Auc Score: 0.8379




In [140]:
import pickle

# Train Logistic Regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train_avg_w2v, y_train)

# Save model to file
with open('logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(logreg_model, f)

Prediction

In [144]:
import pickle
import numpy as np

def avg_word2vec(sentence, model):
    vec = []
    for word in sentence:
        if word in model.wv:
            vec.append(model.wv[word])
    if vec:
        return np.mean(np.array(vec), axis=0)
    else:
        return np.zeros(100)

def predict_sentence(model_file, w2v_model_file, input_sentence):
    with open(model_file, 'rb') as f:
        loaded_logreg_model = pickle.load(f)
    with open(w2v_model_file, 'rb') as f:
        loaded_w2v_model = pickle.load(f)
        
    input_tokenized = word_tokenize(input_sentence.lower())
    input_vector = avg_word2vec(input_tokenized, loaded_w2v_model)
    input_vector = np.array([input_vector])  # Reshape for prediction
    
    prediction = loaded_logreg_model.predict(input_vector)
    probability = loaded_logreg_model.predict_proba(input_vector)
    
    return prediction, probability

input_sentence = "My Super Ex Girlfriend is a fun movie that you shouldn't really take seriously, it's just a cute romantic comedy that I think if I could get a laugh out of it, anyone could"
model_file = 'logistic_regression_model.pkl'
w2v_model_file = 'w2v_model.pkl'

prediction, probability = predict_sentence(model_file, w2v_model_file, input_sentence)
print("Prediction:", prediction)
print("Probability:", probability)

Prediction: [1]
Probability: [[0.22586499 0.77413501]]


Hyper Parameter Tunning

In [137]:
## Gradient params

gradient_params={
             "criterion": ['friedman_mse','squared_error','mse'],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500],
              "max_depth": [5, 8, 15, None, 10]}



In [138]:
# Models list for Hyperparameter tuning
randomcv_models = [("Gradient", GradientBoostingClassifier(), gradient_params)]

In [139]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


ValueError: 
All the 300 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
44 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of GradientBoostingClassifier must be a str among {'friedman_mse', 'squared_error'}. Got 'mse' instead.

--------------------------------------------------------------------------------
46 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of GradientBoostingClassifier must be a str among {'squared_error', 'friedman_mse'}. Got 'mse' instead.

--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\ensemble\_gb.py", line 659, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\utils\validation.py", line 1301, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\utils\_array_api.py", line 745, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\pandas\core\series.py", line 1031, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'know girl figure skating lead girl dated brother always really nice also live cranbrook bc 15 minute fort steele haha used go field trip elementary school kinda weird seeing movie also chance movie filming extra casting call mallbut didnt feel like going time wasnt interested acting totally wish awesome movie bought ebay never came kind weird seeing partially filmed excited came really loved story line poler bear kinda cutebut anyone question fort steele ask away'

--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\ensemble\_gb.py", line 659, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\utils\validation.py", line 1301, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\utils\_array_api.py", line 745, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\pandas\core\series.py", line 1031, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'watched last night morning - thats much liked something movie movie almost cry would strongly recommend latter day friend - definitely worth seeing agree say part movie look realistic example main character totally cute perfect physical shape although round also type shape rarely meet people like single never met couple part movie including coincidence look realistic well movie life story'

--------------------------------------------------------------------------------
52 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'loss' parameter of GradientBoostingClassifier must be a str among {'log_loss', 'exponential'}. Got 'deviance' instead.

--------------------------------------------------------------------------------
23 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Projects\Movie_Sentimental_NLP\menv1\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'loss' parameter of GradientBoostingClassifier must be a str among {'exponential', 'log_loss'}. Got 'deviance' instead.
