# Case 2: Predicting with rating


# Logistic Regression


## Data importing and cleaning


In [24]:
import os
import pandas as pd
from tqdm import tqdm
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

# Capture the working directory once and report it so the dataset location can be checked.
wd = os.getcwd()
print("Current working directory: ", wd)
# If the directory printed above is wrong, switch to the right one with os.chdir("path").
# The dataset folder must be named "MovieReview" (rename it from "Movie Review")
# and must be located inside the current working directory.

Current working directory:  c:\Users\parma\OneDrive\Documents\y2s2\it1244


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def _collectReviewRows(directory, category, idOffset, desc):
    '''
    Build one row dict per "<id>_<rating>.txt" file found in a directory.

    directory: Folder whose file names encode the review id and rating
    category: Label stored in every row's "category" field
    idOffset: Integer added to each parsed id
    desc: Caption shown on the tqdm progress bar
    RETURN: List of {"id", "rating", "category"} dicts
    RAISES: FileNotFoundError if the directory does not exist;
            ValueError / IndexError if a .txt name is not "<int>_<int>.txt"
    '''
    rows = []
    # os.listdir() gives names in arbitrary order; sorting keeps the row order
    # (and therefore any seeded split done later) the same on every machine.
    for filename in tqdm(sorted(os.listdir(directory)), desc=desc):
        # splitext removes the real extension; str.rstrip(".txt") would instead
        # strip any trailing '.', 't' or 'x' characters.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".txt":
            # Skip stray non-review files (e.g. .DS_Store, Thumbs.db)
            continue
        details = stem.split("_")
        reviewID, rating = int(details[0]) + idOffset, int(details[1])
        rows.append({"id": reviewID, "rating": rating, "category": category})
    return rows


def loadAllData(WD=None):
    '''
    Load the id, rating and sentiment label of every review into a DataFrame.

    Only the file names under "MovieReview/data/pos" and "MovieReview/data/neg"
    are read; the review text itself is never opened.

    WD: Directory containing the "MovieReview" folder. Defaults to the
        working directory at call time, so an earlier os.chdir() is honoured.
    RETURN: Pandas Dataframe with columns "id", "rating", "category"
            (category is 1 for pos files, 0 for neg files)
    RAISES: FileNotFoundError if either data folder is missing
    '''
    # Resolve the default here rather than in the signature: a default of
    # os.getcwd() would be evaluated once, when the function is defined.
    if WD is None:
        WD = os.getcwd()
    columns = ["id", "rating", "category"]
    posWD = os.path.join(WD, "MovieReview", "data", "pos")
    negWD = os.path.join(WD, "MovieReview", "data", "neg")
    rows = _collectReviewRows(posWD, 1, 0, 'Processing Positive Reviews')
    # Offset negative ids by 25000 to prevent overlaps with the positive ids
    rows += _collectReviewRows(negWD, 0, 25000, 'Processing Negative Reviews')
    return pd.DataFrame(rows, columns=columns)

In [20]:
# Load every review's id, rating and sentiment label into one DataFrame.
dataframe = loadAllData()
# Leave the preview as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump that print() produces.
dataframe.head()

Processing Positive Reviews: 100%|██████████| 25000/25000 [00:00<00:00, 1375759.01it/s]
Processing Negative Reviews: 100%|██████████| 25000/25000 [00:00<00:00, 1141232.68it/s]

      id  rating  category
0  10000       8         1
1  10001       7         1
2  10002       8         1
3  10003      10         1
4  10004       9         1





## Model


In [26]:
# Logistic regression using the rating as the only feature
# Features: the "rating" column reshaped to (n_samples, 1); target: the "category" column.
X = dataframe["rating"].to_numpy().reshape(-1, 1)
y = dataframe["category"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3)

# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)

# NOTE(review): this cell reported perfect scores; that would be expected if the
# category label is itself derived from the rating -- confirm before reading the
# metrics below as evidence of predictive skill.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

precision = precision_score(y_test, y_pred)
print("Precision:", precision)

Accuracy: 1.0
Precision: 1.0
