# Data Exploration 

## Loading the data

In [58]:
import os
import pandas as pd

## Load cleaned training set

In [59]:
# Directory containing the training CSV that is read in the next cell
path = "../raw_data/"

In [62]:
# Load only the columns needed for modelling. Selecting them with ``usecols``
# avoids holding a second full copy of the 3.6M-row frame in memory and fails
# loudly if one of the expected columns is missing from the file.
columns = ["label", "text", "clean_text"]
file_path = os.path.join(path, "train_df_ml_clean.csv")

# ``usecols`` keeps the file's column order, so re-select with ``[columns]`` to
# pin the order: later cells read the label by position (``df.iloc[:, 0]``).
df = pd.read_csv(file_path, usecols=columns)[columns]
df["label"] = df["label"].astype(int)

df.head()

Unnamed: 0,label,text,clean_text
0,1,Stuning even for the non-gamer: This sound tra...,stuning even for the nongamer this sound track...
1,1,The best soundtrack ever to anything.: I'm rea...,the best soundtrack ever to anything im readin...
2,1,Amazing!: This soundtrack is my favorite music...,amazing this soundtrack is my favorite music o...
3,1,Excellent Soundtrack: I truly like this soundt...,excellent soundtrack i truly like this soundtr...
4,1,"Remember, Pull Your Jaw Off The Floor After He...",remember pull your jaw off the floor after hea...


In [63]:
# Sanity check: (rows, columns) of the loaded training frame
df.shape

(3600000, 3)

In [None]:
# NOTE: this shift was accidentally applied twice, so the labels are now
# -1 (bad) and 0 (good) instead of the intended 0/1. The model only needs two
# distinct classes, so training is not affected.
# The cell is not idempotent: every additional run lowers all labels by one more.
# Intended mapping: turn labels from 1 to 0 (bad) and 2 to 1 (good)
df.label = df['label'] - 1

## Vectorize data and save TF-IDF matrix locally

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [67]:
# TF-IDF over unigrams and bigrams, with the vocabulary capped at 10,000 features
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=10000)

In [110]:
#Cleaning leads to some nan values; drop those rows before vectorizing.
#NOTE(review): presumably texts that ended up empty after cleaning and were read
#back from the CSV as NaN - confirm.
#Run this before fitting the vectorizer so the TF-IDF matrix and df['label']
#keep the same number of rows.
df = df.dropna()

In [70]:
# Learn the TF-IDF vocabulary from the cleaned training text and vectorize it
# in one step; the result is a sparse matrix with one row per review
X_train_full = vectorizer.fit_transform(df['clean_text'])

In [72]:
import joblib
# Persist the vectorized training matrix so it can be reloaded without re-fitting.
# NOTE(review): only the matrix is written here; no cell in this notebook saves
# the fitted vectorizer itself - confirm it is persisted elsewhere.
joblib.dump(X_train_full, '../preprocessing_pipelines/tfidf_matrix.pkl')

['../preprocessing_pipelines/tfidf_matrix.pkl']

In [73]:
# Sanity check: (rows, features) of the TF-IDF matrix
X_train_full.shape

(3599990, 10000)

## Create and train a model on 360k rows (10% of data)

In [74]:
# First 360,000 rows of the TF-IDF matrix (all feature columns)
X_train_reduced = X_train_full[:360000]

In [75]:
# Labels of the same first 360,000 rows ("label" is the first column of df)
y_train_reduced = df["label"].iloc[:360000]
y_train_reduced

0         0
1         0
2         0
3         0
4         0
         ..
359996   -1
359997   -1
359998   -1
359999    0
360000   -1
Name: label, Length: 360000, dtype: int64

In [None]:
# Implement Logistic Regression with optimized params on 10% training set
# NOTE(review): max_features is not used in this cell; the vectorizer was
# created with a literal 10000 in an earlier cell.
max_features = 10000
# Passed to LogisticRegression as C
logreg_C = 10

model = LogisticRegression(C=logreg_C, max_iter=1000)

# Last expression of the cell, so the fitted estimator is displayed
model.fit(X_train_reduced, y_train_reduced)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [77]:
# Spot check on a single row outside the 360,000-row training subset: its features
X_train_full[360001,:]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 44 stored elements and shape (1, 10000)>

In [78]:
# True label of that same row (column 0 is "label")
df.iloc[360001,0]

np.int64(-1)

In [79]:
# Model prediction for that same row, to compare with the true label
model.predict(X_train_full[360001,:])

array([-1])

## Training on more data

In [None]:
# Positional 80/20 split of the vectorized data: the first 80% of the rows are
# used for training, the remaining 20% for evaluation (no shuffling).
# NOTE(review): the TF-IDF vectorizer was fitted on all rows, including the
# ones held out here, so the hold-out score is not fully independent.
index = round(0.8 * len(df))
X_train_final, X_test_final = X_train_full[:index], X_train_full[index:]
y_train_final, y_test_final = df["label"].iloc[:index], df["label"].iloc[index:]
index

2879992

In [86]:
# Implement Logistic Regression with optimized params on the large (80%) training split
# (no learning curves are computed in this cell)
logreg_C = 10

model = LogisticRegression(C=logreg_C, max_iter=1000)

# Last expression of the cell, so the fitted estimator is displayed
model.fit(X_train_final, y_train_final)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [111]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [88]:
# Accuracy of the model on the 20% of rows held out by the positional split
y_pred = model.predict(X_test_final)
accuracy_score(y_test_final,y_pred)

0.9223803399453887

### Train a model on all 3.6M rows

In [125]:
# Final model: same hyper-parameters as before, fitted on every row of the
# vectorized training set. This overwrites the previously trained `model`.
logreg_C = 10

model = LogisticRegression(C=logreg_C, max_iter=1000)

model.fit(X_train_full, df['label'])

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


## Test the model on the real test data

In [89]:
# Load the separate test set (not part of the training CSV read above)
test_df = pd.read_csv('../raw_data/raw_test_data.csv')

In [135]:
import string
import re
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [136]:
def clean_text(text):
    """
    Normalize one raw review text so it can be vectorized.

    Steps, in order: strip surrounding whitespace, lowercase, remove
    digit-only tokens, remove ASCII punctuation, tokenize, lemmatize every
    token and join the tokens with single spaces.

    Input: String. Anything that is not a string (e.g. None or the float NaN
           pandas uses for missing values) is treated as an empty text.
    Output: String
    """
    # A missing value would otherwise raise AttributeError on .strip() and
    # abort a whole Series.apply run; treat it as empty text instead.
    if not isinstance(text, str):
        return ""

    #No whitespaces in beginning or end
    text = text.strip()
    #lowercase
    text = text.lower()
    #remove numbers (only digit runs that stand alone between word boundaries)
    text = re.sub(r'\b\d+\b', '', text)
    #remove punctuation
    text = re.sub(rf"[{re.escape(string.punctuation)}]", '', text)

    # Tokenizing
    tokenized = word_tokenize(text)
    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in tokenized]
    return " ".join(lemmatized)

In [137]:
#Clean the test texts with the same cleaning function (vectorizing happens in a later cell).
#This runs clean_text once per row, so it can take a while on the full test set.
test_df['clean_text'] = test_df['text'].apply(clean_text)

KeyboardInterrupt: 

In [None]:
#Adjust the test labels so they use the same encoding as the training labels.
#The shift is derived from the data instead of a hard-coded "- 1": it moves the
#smallest test label onto the smallest training label. Once both encodings
#match the offset is 0, so re-running the cell changes nothing and there is no
#need to guess how many times it has to be executed.
label_offset = test_df['label'].min() - df['label'].min()
test_df['label'] = test_df['label'] - label_offset

#Every test label must now be a label the model was trained on
assert set(test_df['label'].unique()) <= set(df['label'].unique()), "test labels do not match training labels"

In [99]:
# Preview the test frame after cleaning and label adjustment
test_df.head(3)

Unnamed: 0,label,text,clean_text
0,0,Great CD: My lovely Pat has one of the GREAT v...,great cd my lovely pat ha one of the great voi...
1,0,One of the best game music soundtracks - for a...,one of the best game music soundtrack for a ga...
2,-1,Batteries died within a year ...: I bought thi...,battery died within a year i bought this charg...


In [None]:
# Vectorize the cleaned test text with the already fitted vectorizer
# (transform only, the vocabulary is not re-fitted on the test data)
X_test = vectorizer.transform(test_df['clean_text'])

In [126]:
#Use previously trained model to predict test labels
#y_test holds the true labels, y_pred the model's predictions for the same rows
y_test = test_df['label']
y_pred = model.predict(X_test)

In [127]:
# Accuracy on the real test set
accuracy_score(y_test, y_pred)

0.9214275

In [128]:
#Adjust labels to test precision and recall: shift the predictions up by one
#(-1/0 -> 0/1) so the positive class is labelled 1, which precision_score and
#recall_score treat as the positive class by default.
#NOTE: not idempotent - run exactly once, together with the same shift of y_test.

y_pred = y_pred + 1

In [129]:
#Adjusting test labels by the same amount as the predictions.
#NOTE: not idempotent - run exactly once, together with the same shift of y_pred.
y_test = y_test +1

In [130]:
# Precision and recall for the positive ("good") class. The positive class is
# the larger of the two label values, so it is passed explicitly instead of
# relying on scikit-learn's default pos_label=1. The result is then correct
# whether the labels are currently encoded as -1/0 or 0/1.
positive_label = y_test.max()
precision_score(y_test, y_pred, pos_label=positive_label), recall_score(y_test, y_pred, pos_label=positive_label)

(0.9199304479530473, 0.92321)

In [105]:
#Save vectorized test set next to the vectorized training matrix
joblib.dump(X_test, '../preprocessing_pipelines/test_tfidf_matrix.pkl')

['../preprocessing_pipelines/test_tfidf_matrix.pkl']

### Saving model locally

In [131]:
from datetime import datetime

In [132]:
# Timestamped file name (minute resolution) so earlier models are not overwritten
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
model_path = f"../model/logreg_full_model_{timestamp}.pkl"
# Create the target directory if it does not exist yet, so a long training run
# is not lost to a missing folder
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)
print(f"Model saved to: {model_path}")

Model saved to: ../model/logreg_full_model_20250610_1746.pkl


In [133]:
# Sanity check: number of rows in the feature matrix the final model was fitted on
X_train_full.shape

(3599990, 10000)

In [134]:
# ... and the number of labels, which has to equal the number of matrix rows
df['label'].shape

(3599990,)

In [1]:
import dill
import pickle

In [2]:
import pandas as pd
import numpy as np
import os

#Basic cleaning
import string
import re

#ML tokenizing, lemmatizing and vectorizing
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
def preprocess_series(X):
    """
    Helper function for the pipeline: clean every text in X.

    This function always has to be available in the namespace when loading
    the pipeline.

    Input: Iterable of strings
    Output: List of cleaned strings, in the same order as the input
    """
    def basic_cleaning(text):
        """
        Universal cleaning step every text goes through before any further
        preprocessing.
        Input: String
        Output: String
        """
        # Trim surrounding whitespace and lowercase
        normalized = text.strip().lower()

        # Drop standalone numbers first, then every punctuation character
        without_numbers = re.sub(r'\b\d+\b', '', normalized)
        without_punctuation = re.sub(rf"[{re.escape(string.punctuation)}]", '', without_numbers)

        # Tokenize, then lemmatize token by token
        tokens = word_tokenize(without_punctuation)
        lemmatizer = WordNetLemmatizer()
        lemmas = map(lemmatizer.lemmatize, tokens)

        return " ".join(lemmas)

    cleaned = []
    for raw_text in X:
        cleaned.append(basic_cleaning(raw_text))
    return cleaned


In [4]:
# Load the pickled preprocessing pipeline. A path relative to the notebook is
# used (same layout as the other '../preprocessing_pipelines/' paths in this
# notebook) instead of a user-specific absolute path, so it works on any checkout.
# preprocess_series must already be defined, otherwise unpickling fails.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("../preprocessing_pipelines/preproc_pipeline_ml.pkl", 'rb') as f:
    pipe = pickle.load(f)

In [5]:
# Ensure full recursive serialization
# NOTE(review): presumably set so the pipeline's helper function is stored
# together with the pipeline when it is dumped with dill - confirm against the dill docs.
dill.settings['recurse'] = True

In [6]:
# Re-serialize the loaded pipeline with dill. A path relative to the notebook is
# used (same layout as the '../model/' path used when saving the model) instead
# of a user-specific absolute path, so it works on any checkout.
with open("../model/preproc_pipeline_ml_2.pkl", "wb") as f:
    dill.dump(pipe, f)