In [1]:

##Importing IMDB Dataset and cleaning reviews

#Importing libraries
import nltk
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

#Importing dataset and replacing labels with 0 and 1 for classification
df = pd.read_csv('IMDBDataset.csv', encoding = 'Latin-1')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

#Defining stop_words and lemmatizer
stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Defining clean_text function
def clean_text(text):
    text = strip_html(text)
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

#Creating new column for processed reviews
df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))

  soup = BeautifulSoup(text, "html.parser")


In [2]:
# Importing necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Defining input and target variable
x = df['Processed_Reviews']
y = df['sentiment']

# Training and splitting
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Vectorization and Bag of words method with default parameters
count_vect = CountVectorizer().fit(df['Processed_Reviews'].values.astype('U'))
bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))

# Instantiate the Logistic Regression model (using the default parameters)
logreg = LogisticRegression()

# Fit the model with pre-processed data
logreg.fit(bow_train, y_train)

# Perform classification and prediction on samples in bow_test
predicted_logreg = logreg.predict(bow_test)

# Printing the classification report for the Logistic Regression model
print("Logistic Regression Classification Report:")
print(classification_report(y_test, predicted_logreg))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      5035
           1       0.88      0.88      0.88      4965

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [3]:
# Importing libraries
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Creating a Pipeline with TfidfVectorizer and LogisticRegression
pipeline = Pipeline([
    ('vect', TfidfVectorizer(ngram_range=(1, 3), min_df=5)),
    ('logreg', LogisticRegression(max_iter=500, solver='saga', C=100))
])

# Defining hyperparameters for Grid Search (you can adjust these if you want to explore other parameters)
parameters = {
    # Since the main parameters are fixed in the pipeline, these are additional parameters you might want to explore.
    'vect__use_idf': [True, False],
    'logreg__penalty': ['l2', 'none']  # Note: 'saga' solver supports 'l1', 'l2', and 'none' penalties. Adjust as needed.
}

# Define grid search
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(pipeline, param_grid=parameters, refit=True, verbose=3, cv=cv)
grid_result = grid_search.fit(df.loc[:5000, 'Processed_Reviews'].values.astype('U'), df.loc[:5000, 'sentiment'].values.astype('U'))

# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


Fitting 30 folds for each of 4 candidates, totalling 120 fits
[CV 1/30] END logreg__penalty=l2, vect__use_idf=True;, score=0.854 total time=   3.6s
[CV 2/30] END logreg__penalty=l2, vect__use_idf=True;, score=0.836 total time=   3.5s
[CV 3/30] END logreg__penalty=l2, vect__use_idf=True;, score=0.888 total time=   3.6s
[CV 4/30] END logreg__penalty=l2, vect__use_idf=True;, score=0.836 total time=   3.4s
[CV 5/30] END logreg__penalty=l2, vect__use_idf=True;, score=0.876 total time=   3.7s
[CV 6/30] END logreg__penalty=l2, vect__use_idf=True;, score=0.860 total time=   3.6s
[CV 7/30] END logreg__penalty=l2, vect__use_idf=True;, score=0.854 total time=   3.6s
[CV 8/30] END logreg__penalty=l2, vect__use_idf=True;, score=0.862 total time=   3.6s
[CV 9/30] END logreg__penalty=l2, vect__use_idf=True;, score=0.878 total time=   3.8s
[CV 10/30] END logreg__penalty=l2, vect__use_idf=True;, score=0.858 total time=   3.8s
[CV 11/30] END logreg__penalty=l2, vect__use_idf=True;, score=0.872 total tim



[CV 1/30] END logreg__penalty=none, vect__use_idf=True;, score=0.856 total time=   5.3s




[CV 2/30] END logreg__penalty=none, vect__use_idf=True;, score=0.836 total time=   5.7s




[CV 3/30] END logreg__penalty=none, vect__use_idf=True;, score=0.880 total time=   5.5s




[CV 4/30] END logreg__penalty=none, vect__use_idf=True;, score=0.828 total time=   5.5s




[CV 5/30] END logreg__penalty=none, vect__use_idf=True;, score=0.874 total time=   5.6s




[CV 6/30] END logreg__penalty=none, vect__use_idf=True;, score=0.862 total time=   5.5s




[CV 7/30] END logreg__penalty=none, vect__use_idf=True;, score=0.846 total time=   5.5s




[CV 8/30] END logreg__penalty=none, vect__use_idf=True;, score=0.860 total time=   5.2s




[CV 9/30] END logreg__penalty=none, vect__use_idf=True;, score=0.876 total time=   5.2s




[CV 10/30] END logreg__penalty=none, vect__use_idf=True;, score=0.866 total time=   5.2s




[CV 11/30] END logreg__penalty=none, vect__use_idf=True;, score=0.866 total time=   5.2s




[CV 12/30] END logreg__penalty=none, vect__use_idf=True;, score=0.864 total time=   5.2s




[CV 13/30] END logreg__penalty=none, vect__use_idf=True;, score=0.876 total time=   5.1s




[CV 14/30] END logreg__penalty=none, vect__use_idf=True;, score=0.878 total time=   4.9s




[CV 15/30] END logreg__penalty=none, vect__use_idf=True;, score=0.870 total time=   5.0s




[CV 16/30] END logreg__penalty=none, vect__use_idf=True;, score=0.844 total time=   5.1s




[CV 17/30] END logreg__penalty=none, vect__use_idf=True;, score=0.868 total time=   5.5s




[CV 18/30] END logreg__penalty=none, vect__use_idf=True;, score=0.852 total time=   5.6s




[CV 19/30] END logreg__penalty=none, vect__use_idf=True;, score=0.838 total time=   5.6s




[CV 20/30] END logreg__penalty=none, vect__use_idf=True;, score=0.876 total time=   5.6s




[CV 21/30] END logreg__penalty=none, vect__use_idf=True;, score=0.838 total time=   5.4s




[CV 22/30] END logreg__penalty=none, vect__use_idf=True;, score=0.854 total time=   5.3s




[CV 23/30] END logreg__penalty=none, vect__use_idf=True;, score=0.882 total time=   5.3s




[CV 24/30] END logreg__penalty=none, vect__use_idf=True;, score=0.868 total time=   5.1s




[CV 25/30] END logreg__penalty=none, vect__use_idf=True;, score=0.854 total time=   5.2s




[CV 26/30] END logreg__penalty=none, vect__use_idf=True;, score=0.860 total time=   5.3s




[CV 27/30] END logreg__penalty=none, vect__use_idf=True;, score=0.862 total time=   5.3s




[CV 28/30] END logreg__penalty=none, vect__use_idf=True;, score=0.864 total time=   5.0s




[CV 29/30] END logreg__penalty=none, vect__use_idf=True;, score=0.846 total time=   5.4s




[CV 30/30] END logreg__penalty=none, vect__use_idf=True;, score=0.872 total time=   5.4s




[CV 1/30] END logreg__penalty=none, vect__use_idf=False;, score=0.828 total time=   5.5s




[CV 2/30] END logreg__penalty=none, vect__use_idf=False;, score=0.844 total time=   5.4s




[CV 3/30] END logreg__penalty=none, vect__use_idf=False;, score=0.864 total time=   5.4s




[CV 4/30] END logreg__penalty=none, vect__use_idf=False;, score=0.832 total time=   5.7s




[CV 5/30] END logreg__penalty=none, vect__use_idf=False;, score=0.866 total time=   5.4s




[CV 6/30] END logreg__penalty=none, vect__use_idf=False;, score=0.854 total time=   5.6s




[CV 7/30] END logreg__penalty=none, vect__use_idf=False;, score=0.828 total time=   5.2s




[CV 8/30] END logreg__penalty=none, vect__use_idf=False;, score=0.862 total time=   5.1s




[CV 9/30] END logreg__penalty=none, vect__use_idf=False;, score=0.858 total time=   5.2s




[CV 10/30] END logreg__penalty=none, vect__use_idf=False;, score=0.846 total time=   5.3s




[CV 11/30] END logreg__penalty=none, vect__use_idf=False;, score=0.850 total time=   5.3s




[CV 12/30] END logreg__penalty=none, vect__use_idf=False;, score=0.850 total time=   5.1s




[CV 13/30] END logreg__penalty=none, vect__use_idf=False;, score=0.854 total time=   5.1s




[CV 14/30] END logreg__penalty=none, vect__use_idf=False;, score=0.872 total time=   5.1s




[CV 15/30] END logreg__penalty=none, vect__use_idf=False;, score=0.878 total time=   5.3s




[CV 16/30] END logreg__penalty=none, vect__use_idf=False;, score=0.834 total time=   5.4s




[CV 17/30] END logreg__penalty=none, vect__use_idf=False;, score=0.856 total time=   5.5s




[CV 18/30] END logreg__penalty=none, vect__use_idf=False;, score=0.844 total time=   5.7s




[CV 19/30] END logreg__penalty=none, vect__use_idf=False;, score=0.842 total time=   5.6s




[CV 20/30] END logreg__penalty=none, vect__use_idf=False;, score=0.864 total time=   5.5s




[CV 21/30] END logreg__penalty=none, vect__use_idf=False;, score=0.836 total time=   5.7s




[CV 22/30] END logreg__penalty=none, vect__use_idf=False;, score=0.862 total time=   5.6s




[CV 23/30] END logreg__penalty=none, vect__use_idf=False;, score=0.876 total time=   5.5s




[CV 24/30] END logreg__penalty=none, vect__use_idf=False;, score=0.860 total time=   6.1s




[CV 25/30] END logreg__penalty=none, vect__use_idf=False;, score=0.830 total time=12.9min




[CV 26/30] END logreg__penalty=none, vect__use_idf=False;, score=0.848 total time=   6.9s




[CV 27/30] END logreg__penalty=none, vect__use_idf=False;, score=0.852 total time=   7.2s




[CV 28/30] END logreg__penalty=none, vect__use_idf=False;, score=0.858 total time=   6.1s




[CV 29/30] END logreg__penalty=none, vect__use_idf=False;, score=0.828 total time=   5.9s




[CV 30/30] END logreg__penalty=none, vect__use_idf=False;, score=0.866 total time=   5.7s
Best: 0.862962 using {'logreg__penalty': 'l2', 'vect__use_idf': True}
0.862962 (0.014623) with: {'logreg__penalty': 'l2', 'vect__use_idf': True}
0.855764 (0.014516) with: {'logreg__penalty': 'l2', 'vect__use_idf': False}
0.860563 (0.014144) with: {'logreg__penalty': 'none', 'vect__use_idf': True}
0.851432 (0.014300) with: {'logreg__penalty': 'none', 'vect__use_idf': False}
