# Data Exploration 

## Loading the data

In [6]:
import os
import pandas as pd

In [7]:
path = "../raw_data/"

In [29]:
file_path = os.path.join(path, "train_raw.csv")

data = pd.read_csv(file_path)


df = pd.DataFrame(data, columns=["label", "text"])
df["label"] = df["label"].astype(int)


df.head()

Unnamed: 0,label,text
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."


In [10]:
file_path = os.path.join(path, "test_raw.csv")


data = pd.read_csv(file_path)

df_test = pd.DataFrame(data, columns=["label", "text"])
df_test["label"] = df_test["label"].astype(int)

In [11]:
df_test.head()


Unnamed: 0,label,text
0,2,Great CD: My lovely Pat has one of the GREAT v...
1,2,One of the best game music soundtracks - for a...
2,1,Batteries died within a year ...: I bought thi...
3,2,"works fine, but Maha Energy is better: Check o..."
4,2,Great for the non-audiophile: Reviewed quite a...


In [12]:
df.shape

(3600000, 2)

### Save data as csv locally

In [30]:
#df.to_csv("../raw_data/train_raw.csv", header=True, index=False)
#df_test.to_csv("../raw_data/test_raw.csv", header=True, index=False)

In [31]:
# Turn labels from 1 to 0 (bad) and 2 to 1 (good)
df.label = df['label'] - 1

In [37]:
#Create a small train df to speed up computations
df_train_small = df.sample(frac=0.01)

In [38]:
df_train_small.shape

(36000, 2)

In [39]:
df_train_small.label.value_counts()

label
0    18030
1    17970
Name: count, dtype: int64

In [40]:
#Create a small test df to speed up computations
df_test_small = df_test.sample(frac=0.01)

In [41]:
df_test_small.label.value_counts()

label
1    2013
2    1987
Name: count, dtype: int64

## Preprocessing 

In [42]:
import string
import re
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [43]:
df_train_small.dtypes

label     int64
text     object
dtype: object

In [44]:
def clean_text(text):
    #No whitespaces in beginning or end
    text = text.strip()
    #lowercase
    text= text.lower()
    #remove numbers
    text = re.sub(r'\b\d+\b', '', text)

    text = re.sub(rf"[{re.escape(string.punctuation)}]", '', text)

    # Tokenizing
    tokenized = word_tokenize(text)
    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in tokenized]
    text = " ".join(lemmatized)
    return text


In [45]:
df_train_small['clean_text'] = df_train_small['text'].apply(clean_text)
df.head(3)

Unnamed: 0,label,text
0,1,Stuning even for the non-gamer: This sound tra...
1,1,The best soundtrack ever to anything.: I'm rea...
2,1,Amazing!: This soundtrack is my favorite music...


In [46]:
X_train_small = df_train_small['clean_text']
y_train_small = df_train_small['label']

In [47]:
X_train_small_cleaned = df_train_small['clean_text']

In [54]:
df['clean_text'] = df['text'].apply(clean_text)

In [56]:
df.head(3)

Unnamed: 0,label,text,clean_text
0,1,Stuning even for the non-gamer: This sound tra...,stuning even for the nongamer this sound track...
1,1,The best soundtrack ever to anything.: I'm rea...,the best soundtrack ever to anything im readin...
2,1,Amazing!: This soundtrack is my favorite music...,amazing this soundtrack is my favorite music o...


In [55]:
# Save df containing clean text
df.to_csv("../raw_data/train_df_ml_clean.csv", header = True, index = False)