# Load Data

In [61]:
import pandas as pd

# Location of the cleaned review dataset (absolute path inside the dev container)
DATASET_PATH = '/workspaces/LogisticRegression/Dataset/cleaned_reviews.csv'

# Read the CSV into the working DataFrame
df = pd.read_csv(DATASET_PATH)

In [62]:
# Preview the first rows of the freshly loaded frame
df.head()

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score
0,positive,i wish would have gotten one earlier love it a...,19,5.0
1,neutral,i ve learned this lesson again open the packag...,88,1.0
2,neutral,it is so slow and lags find better option,9,2.0
3,neutral,roller ball stopped working within months of m...,12,1.0
4,neutral,i like the color and size but it few days out ...,21,1.0


# Analyse Data

In [63]:
# (number of rows, number of columns) of the raw frame
df.shape

(17340, 4)

In [64]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

Unnamed: 0,cleaned_review_length,review_score
count,17340.0,17340.0
mean,30.300461,3.649077
std,35.83654,1.6735
min,0.0,1.0
25%,9.0,2.0
50%,20.0,5.0
75%,38.0,5.0
max,571.0,5.0


In [65]:
# Column dtypes and non-null counts. In the recorded run, cleaned_review has
# 17337 non-null values out of 17340 rows, i.e. 3 reviews are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17340 entries, 0 to 17339
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sentiments             17340 non-null  object 
 1   cleaned_review         17337 non-null  object 
 2   cleaned_review_length  17340 non-null  int64  
 3   review_score           17340 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 542.0+ KB


In [66]:
# List the distinct labels present in the sentiments column

df['sentiments'].unique()

array(['positive', 'neutral', 'negative'], dtype=object)

# Cleaning and preprocessing

In [67]:
# This is a sentiment-analysis task, so only the label column ('sentiments')
# and the review text ('cleaned_review') are needed from here on.
columns_to_keep = ['sentiments', 'cleaned_review']
df = df[columns_to_keep]

df.head()

Unnamed: 0,sentiments,cleaned_review
0,positive,i wish would have gotten one earlier love it a...
1,neutral,i ve learned this lesson again open the packag...
2,neutral,it is so slow and lags find better option
3,neutral,roller ball stopped working within months of m...
4,neutral,i like the color and size but it few days out ...


In [68]:
# Only these two classes are retained; rows carrying any other label
# (e.g. 'neutral') are discarded.
labels_to_keep = ['positive', 'negative']

# Build the boolean row mask first, then apply it to the frame
is_kept_label = df['sentiments'].isin(labels_to_keep)
df = df[is_kept_label]

In [69]:
# Flag every row that repeats an earlier (sentiments, cleaned_review) pair,
# then collect those rows so they can be counted.
duplicate_mask = df.duplicated()
duplicates = df[duplicate_mask]
print("Number of duplicate rows:", len(duplicates))


Number of duplicate rows: 2373


In [70]:
# Remove duplicates.
# Rebinding to the returned frame (instead of inplace=True) is deliberate:
# df was produced by boolean filtering, and mutating such a derived frame in
# place can trigger pandas' SettingWithCopyWarning. The surviving rows keep
# their original index labels.
df = df.drop_duplicates()

In [71]:
import nltk
import re

# Fetch the NLTK resources used below: the stop-word corpus and the
# 'punkt' tokenizer data.
for package in ('stopwords', 'punkt'):
    nltk.download(package)


# English stop-word list used to filter tokens during preprocessing
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [72]:
# Define function for preprocessing text data
def preprocess_text(text):
    """Normalise a single review into a space-separated string of tokens.

    Steps, in order: lower-case the text, tokenize it with NLTK, drop tokens
    found in the module-level ``stop_words`` list, strip every non-letter
    character from each remaining token, and discard tokens shorter than
    three characters.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN
        that pandas uses for a missing review) is treated as an empty review.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives or when ``text`` is not a string.
    """
    # Missing reviews reach this function as non-string values when it is
    # applied over a column; calling .lower() on them would raise
    # AttributeError, so map them to an empty document instead.
    if not isinstance(text, str):
        return ''

    # Convert text to lower case
    text = text.lower()

    # Remove stop words
    tokens = nltk.tokenize.word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]

    # Remove symbols and digits (tokens made only of such characters become '')
    tokens = [re.sub(r'[^a-zA-Z]+', '', token) for token in tokens]

    # Remove words shorter than 3 characters (this also drops the empty tokens)
    tokens = [token for token in tokens if len(token) > 2]

    # Join tokens back into a string
    return ' '.join(tokens)

In [73]:
# Run every remaining review through preprocess_text, then write the
# normalised text back over the original column.
processed_reviews = df['cleaned_review'].apply(preprocess_text)
df['cleaned_review'] = processed_reviews

In [74]:
# Show the first preprocessed review by position. The index still carries the
# original row labels after filtering and de-duplication, so a label lookup
# with [0] only works while the row labelled 0 happens to survive; .iloc[0]
# always returns the first remaining row.
df['cleaned_review'].iloc[0]

'wish would gotten one earlier love makes working laptop much easier'

In [75]:
# Preview the frame after preprocessing. The index keeps its original row
# labels (it is not reset after filtering and de-duplication), so gaps in the
# labels are expected.
df.head()

Unnamed: 0,sentiments,cleaned_review
0,positive,wish would gotten one earlier love makes worki...
5,positive,overall love mouse size weight clicking fabulo...
7,positive,son uses school issued chromebook school work ...
8,negative,loved cute little mouse broke months wheel bro...
9,negative,spent money get quality product thing stopped ...


In [76]:
from sklearn.feature_extraction.text import CountVectorizer

# Target labels, taken straight from the sentiments column
y = df['sentiments']

# Bag-of-words features: learn the vocabulary from the preprocessed reviews
# and encode them as a token-count matrix in one step
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_review'])

In [77]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train Logistic Regression Model

In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Base estimator: L2-penalised logistic regression with a raised iteration cap
clf = LogisticRegression(max_iter=3000, penalty='l2')

# Candidate values of C explored by the search
hyperparameters = {'C': [0.1, 1, 10]}

# Cross-validated search over the grid, using GridSearchCV's default
# cross-validation and scoring settings
grid_search = GridSearchCV(estimator=clf, param_grid=hyperparameters)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print(grid_search.best_params_)

{'C': 1}


In [79]:
# Use the estimator selected by the grid search instead of refitting the
# untuned `clf`, which would silently ignore the search result whenever the
# best C differs from LogisticRegression's default. GridSearchCV refits the
# best configuration on the full training data by default (refit=True), so no
# further fit call is needed.
clf = grid_search.best_estimator_
clf

In [80]:
from sklearn.metrics import accuracy_score

# Score the held-out split with the fitted classifier
y_pred = clf.predict(X_test)

# Fraction of test reviews whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.9486439699942296


In [81]:
# Make predictions on new reviews.
# The vectorizer was fitted on text that had already been through
# preprocess_text, so unseen reviews get the same normalisation before being
# vectorised; otherwise inference input is tokenised differently from the
# training input.
new_reviews = ["This movie was great!", "This movie was terrible!"]
new_reviews_cleaned = [preprocess_text(review) for review in new_reviews]
new_reviews_numerical = vectorizer.transform(new_reviews_cleaned)
new_reviews_pred = clf.predict(new_reviews_numerical)

print(new_reviews_pred)

['positive' 'negative']


In [None]:
import os
import pickle


# Artifact locations. The classifier keeps its original file name so existing
# consumers of model.pkl are unaffected; the fitted vectorizer is stored next
# to it because the classifier alone cannot score raw text.
MODEL_PATH = '/workspaces/LogisticRegression/Model/model.pkl'
VECTORIZER_PATH = '/workspaces/LogisticRegression/Model/vectorizer.pkl'

# Create the target directory if needed so open() does not fail on a fresh
# checkout
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Save the trained model to a specific path
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(clf, f)

# Save the fitted vectorizer required to turn new text into model features
with open(VECTORIZER_PATH, 'wb') as f:
    pickle.dump(vectorizer, f)