In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
import pickle


In [2]:
import pandas as pd

# Load the training, testing and validation splits.
# The file names use the same capitalisation as the modeling cell further down
# ("Train.csv", "Test.csv", "Valid.csv"). The two cells previously disagreed on
# case, which only works on case-insensitive file systems such as Windows.
# NOTE(review): confirm that the files on disk really are capitalised.
train_df = pd.read_csv("Train.csv")
test_df = pd.read_csv("Test.csv")
valid_df = pd.read_csv("Valid.csv")

# Display the first few rows of each dataset to verify they're loaded correctly
print("Training Data:")
print(train_df.head())

print("\nTesting Data:")
print(test_df.head())

print("\nValidation Data:")
print(valid_df.head())


Training Data:
                                                text  label
0  I grew up (b. 1965) watching and loving the Th...      0
1  When I put this movie in my DVD player, and sa...      0
2  Why do people who do not know what a particula...      0
3  Even though I have great interest in Biblical ...      0
4  Im a die hard Dads Army fan and nothing will e...      1

Testing Data:
                                                text  label
0  I always wrote this series off as being a comp...      0
1  1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...      0
2  This movie was so poorly written and directed ...      0
3  The most interesting thing about Miryang (Secr...      1
4  when i first read about "berlin am meer" i did...      0

Validation Data:
                                                text  label
0  It's been about 14 years since Sharon Stone aw...      0
1  someone needed to make a car payment... this i...      0
2  The Guidelines state that a comment must conta...

In [3]:
# Class balance check: print how many rows carry each label, one split at a time.
label_reports = (
    ("Training Data Label Distribution:", train_df),
    ("\nTesting Data Label Distribution:", test_df),
    ("\nValidation Data Label Distribution:", valid_df),
)
for heading, split_df in label_reports:
    print(heading)
    print(split_df['label'].value_counts())


Training Data Label Distribution:
label
0    20019
1    19981
Name: count, dtype: int64

Testing Data Label Distribution:
label
1    2505
0    2495
Name: count, dtype: int64

Validation Data Label Distribution:
label
1    2514
0    2486
Name: count, dtype: int64


In [4]:
# Add a `review_length` column (number of characters in `text`) to every split.
# The vectorised `.str.len()` accessor is used instead of `.apply(len)`: it
# avoids a Python-level call per row, and a missing review yields NaN rather
# than raising TypeError.
for split_df in (train_df, test_df, valid_df):
    split_df['review_length'] = split_df['text'].str.len()

# Summary statistics of the review lengths, one split at a time.
print("Training Data Review Length Distribution:")
print(train_df['review_length'].describe())

print("\nTesting Data Review Length Distribution:")
print(test_df['review_length'].describe())

print("\nValidation Data Review Length Distribution:")
print(valid_df['review_length'].describe())


Training Data Review Length Distribution:
count    40000.000000
mean      1310.293250
std        988.358599
min         32.000000
25%        698.000000
50%        973.000000
75%       1596.000000
max      13704.000000
Name: review_length, dtype: float64

Testing Data Review Length Distribution:
count     5000.000000
mean      1314.596200
std       1010.339949
min         67.000000
25%        706.000000
50%        970.000000
75%       1578.500000
max      12930.000000
Name: review_length, dtype: float64

Validation Data Review Length Distribution:
count    5000.00000
mean     1297.36800
std       979.91039
min        52.00000
25%       698.00000
50%       957.00000
75%      1560.25000
max      9345.00000
Name: review_length, dtype: float64


In [5]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK data packages used below: the tokenizer model, the stop-word
# list and WordNet. NLTK reports a package as up-to-date when it is already
# installed.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Token normalizers, created once and shared by every preprocess_text call
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Function to preprocess text
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# English stop words; filled in by the first call and reused afterwards so the
# corpus is not re-read for each of the tens of thousands of reviews.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize one review into a space-separated string of tokens.

    Steps, in order: delete punctuation, lowercase, tokenize with
    ``word_tokenize``, drop English stop words, Porter-stem each remaining
    token, then run the WordNet lemmatizer on the stem.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The processed tokens joined by single spaces.

    NOTE(review): punctuation is stripped before stop-word filtering, so a
    contraction such as "don't" becomes "dont" and presumably no longer matches
    the stop-word list. Confirm whether that is intended.
    """
    stop_words = _english_stop_words()
    # Remove punctuation first, then lowercase (same order as before)
    cleaned = text.translate(_PUNCTUATION_TABLE).lower()
    tokens = word_tokenize(cleaned)
    # Filter, stem and lemmatize in a single pass over the tokens
    return ' '.join(
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if token not in stop_words
    )

# Read the three splits again from their CSV files so this cell starts from
# freshly loaded frames.
train_df, test_df, valid_df = (
    pd.read_csv(csv_name) for csv_name in ("Train.csv", "Test.csv", "Valid.csv")
)

# Add a cleaned copy of every review next to the raw text
for split_df in (train_df, test_df, valid_df):
    split_df['preprocessed_text'] = split_df['text'].apply(preprocess_text)

# TF-IDF features: the vocabulary and IDF weights are learned from the training
# split only; the other two splits are projected onto that vocabulary.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_df['preprocessed_text'])
X_test, X_valid = (
    tfidf_vectorizer.transform(split_df['preprocessed_text'])
    for split_df in (test_df, valid_df)
)

# Target labels for each split
y_train, y_test, y_valid = train_df['label'], test_df['label'], valid_df['label']

# Fit a logistic regression classifier (default settings) on the training split
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predict the held-out splits
y_pred_test = logreg.predict(X_test)
y_pred_valid = logreg.predict(X_valid)

# Overall accuracy on each held-out split
test_accuracy = accuracy_score(y_test, y_pred_test)
valid_accuracy = accuracy_score(y_valid, y_pred_valid)

print("Testing Accuracy:", test_accuracy)
print("Validation Accuracy:", valid_accuracy)

# Per-class precision / recall / F1 for each held-out split
for report_heading, y_true, y_pred in (
    ("\nClassification Report for Testing Data:", y_test, y_pred_test),
    ("\nClassification Report for Validation Data:", y_valid, y_pred_valid),
):
    print(report_heading)
    print(classification_report(y_true, y_pred))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mubva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mubva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mubva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Testing Accuracy: 0.8934
Validation Accuracy: 0.8918

Classification Report for Testing Data:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2495
           1       0.88      0.91      0.89      2505

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000


Classification Report for Validation Data:
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      2486
           1       0.88      0.91      0.89      2514

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

