In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import re


In [3]:
# Define the paths to the dataset files
import os


# Directory that holds the train/test review files. It can be overridden with
# the REVIEWS_DATASET_DIR environment variable so the notebook is not tied to
# one user's machine; the original location is kept as the fallback.
base_dataset_path = os.environ.get('REVIEWS_DATASET_DIR', '/home/joel/Downloads/archive (2)')
train_dataset_filename = 'train.ft.txt'
test_dataset_filename = 'test.ft.txt'

# Construct the full paths to the files
train_dataset_path = os.path.join(base_dataset_path, train_dataset_filename)
test_dataset_path = os.path.join(base_dataset_path, test_dataset_filename)

# Load a subset of the dataset into a Pandas DataFrame for initial exploration
def load_dataset_subset(file_path, num_lines=10000):
    """Read up to ``num_lines`` leading lines of a text file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Text file to read; it is decoded as UTF-8.
    num_lines : int, default 10000
        Maximum number of lines taken from the start of the file. Values
        below zero are treated as zero.

    Returns
    -------
    pandas.DataFrame
        One row per line read, stored in a single ``'review'`` column. Each
        line keeps its trailing newline, exactly as read from the file.
    """
    from itertools import islice  # local import: the cell stays self-contained

    with open(file_path, 'r', encoding='utf-8') as file:
        # islice stops quietly at end-of-file, so a file shorter than
        # num_lines yields fewer rows instead of raising StopIteration.
        lines = list(islice(file, max(num_lines, 0)))
    return pd.DataFrame(lines, columns=['review'])

# Read the leading rows of each split (using the loader's default subset size)
train_df, test_df = (
    load_dataset_subset(split_path)
    for split_path in (train_dataset_path, test_dataset_path)
)


In [4]:
# Preview the first rows of each raw frame, one heading per split
print("Training Data Sample:", train_df.head(), sep="\n")

print("Testing Data Sample:", test_df.head(), sep="\n")


Training Data Sample:
                                              review
0  __label__2 Stuning even for the non-gamer: Thi...
1  __label__2 The best soundtrack ever to anythin...
2  __label__2 Amazing!: This soundtrack is my fav...
3  __label__2 Excellent Soundtrack: I truly like ...
4  __label__2 Remember, Pull Your Jaw Off The Flo...
Testing Data Sample:
                                              review
0  __label__2 Great CD: My lovely Pat has one of ...
1  __label__2 One of the best game music soundtra...
2  __label__1 Batteries died within a year ...: I...
3  __label__2 works fine, but Maha Energy is bett...
4  __label__2 Great for the non-audiophile: Revie...


In [5]:
# Normalise raw review text before it is vectorised
def clean_text(text):
    """Lower-case ``text``, drop punctuation and normalise whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The lower-cased text with every character that is neither a word
        character (letters, digits, underscore) nor whitespace removed, runs
        of whitespace collapsed to a single space, and leading/trailing
        whitespace stripped.
    """
    text = text.lower()  # convert to lowercase
    # Strip punctuation BEFORE collapsing whitespace: removing a free-standing
    # symbol such as " - " would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace runs (incl. newlines)
    return text.strip()  # no stray space from the line's trailing newline

# Split one raw dataset line into a binary label and its cleaned text
def preprocess_review(line):
    """Turn a raw ``'<label> <text>'`` line into ``(label, cleaned_text)``.

    Everything before the first space is taken as the label token:
    ``'__label__2'`` maps to 1 and any other token maps to 0. The remainder
    of the line is passed through ``clean_text``.
    """
    raw_label, _, remainder = line.partition(' ')
    if raw_label == '__label__2':
        label = 1
    else:
        label = 0
    return label, clean_text(remainder)

# Derive the 'label' and 'cleaned_text' columns for both splits
for frame in (train_df, test_df):
    # Each review yields a (label, text) pair; zip(*) regroups them column-wise.
    labels, texts = zip(*frame['review'].apply(preprocess_review))
    frame['label'] = labels
    frame['cleaned_text'] = texts


In [6]:
# Preview the first rows of each frame now that the derived columns exist
print("Training Data after Preprocessing:", train_df.head(), sep="\n")

print("Testing Data after Preprocessing:", test_df.head(), sep="\n")


Training Data after Preprocessing:
                                              review  label  \
0  __label__2 Stuning even for the non-gamer: Thi...      1   
1  __label__2 The best soundtrack ever to anythin...      1   
2  __label__2 Amazing!: This soundtrack is my fav...      1   
3  __label__2 Excellent Soundtrack: I truly like ...      1   
4  __label__2 Remember, Pull Your Jaw Off The Flo...      1   

                                        cleaned_text  
0  stuning even for the nongamer this sound track...  
1  the best soundtrack ever to anything im readin...  
2  amazing this soundtrack is my favorite music o...  
3  excellent soundtrack i truly like this soundtr...  
4  remember pull your jaw off the floor after hea...  
Testing Data after Preprocessing:
                                              review  label  \
0  __label__2 Great CD: My lovely Pat has one of ...      1   
1  __label__2 One of the best game music soundtra...      1   
2  __label__1 Batteries died with

In [7]:
# TF-IDF vectorizer capped at the top 5000 features
vectorizer = TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training text only; the test text is
# transformed with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_df['cleaned_text'])
X_test = vectorizer.transform(test_df['cleaned_text'])

# Report the dimensions of both feature matrices
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (10000, 5000)
X_test shape: (10000, 5000)


In [8]:
# Targets come straight from the 'label' column built during preprocessing
y_train, y_test = train_df['label'], test_df['label']

# Logistic regression with default settings, fitted on the training features
model = LogisticRegression()
model.fit(X_train, y_train)

# Persisting the model is optional; the presentation does not need it


In [9]:
# Predict a label for every test row
y_pred = model.predict(X_test)

# Compute the metrics first, then print them in the same order as before
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(report)
print("Accuracy:", accuracy)


              precision    recall  f1-score   support

           0       0.86      0.85      0.86      4875
           1       0.86      0.87      0.87      5125

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Accuracy: 0.8633
