In [1]:
import os
import bz2 
import pandas as pd

In [2]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')      # For nltk<3.9.0
nltk.download('punkt_tab')  # For nltk>=3.9.0
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/marcvicente/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/marcvicente/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/marcvicente/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/marcvicente/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/marcvicente/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Parse Amazon Review raw files

In [3]:
path = "../raw_data"

In [4]:
def parse_raw_files(root_path:str, file_path_string:str):
    
    file_path = os.path.join(root_path, file_path_string)

    data = []
    
    with bz2.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(' ' , 1)
            if len(parts) == 2:
                label, text = parts
                label = label.replace('__label__', '')
                data.append((label, text))
                
    df = pd.DataFrame(data, columns=['label', 'text'])
    df['label'] = df['label'].astype(int)
    
    return df

In [18]:
train_path = '../raw_data/raw_train_data.csv'
test_path = '../raw_data/raw_test_data.csv'

In [19]:
if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
    train_df.head()
    print("File found. Data loaded.")
else:
    # Your alternative logic here
    print("File not found. Performing direct read operation.")
    train_df = parse_raw_files(path,'train.ft.txt.bz2')
    train_df.to_csv('../raw_data/raw_train_data.csv', index=False)
    train_df.head()

File found. Data loaded.


In [22]:
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
    print("File found. Data loaded.")
    test_df.head()
else:
    # Your alternative logic here
    print("File not found. Performing direct read operation.")
    test_df = parse_raw_files(path,'test.ft.txt.bz2')
    test_df.to_csv('../raw_data/raw_test_data.csv', index=False)
    test_df.head()

File found. Data loaded.


In [24]:
test_df.head()

Unnamed: 0,label,text
0,2,Great CD: My lovely Pat has one of the GREAT v...
1,2,One of the best game music soundtracks - for a...
2,1,Batteries died within a year ...: I bought thi...
3,2,"works fine, but Maha Energy is better: Check o..."
4,2,Great for the non-audiophile: Reviewed quite a...


In [7]:
print(train_df.loc[3,"text"])

Excellent Soundtrack: I truly like this soundtrack and I enjoy video game music. I have played this game and most of the music on here I enjoy and it's truly relaxing and peaceful.On disk one. my favorites are Scars Of Time, Between Life and Death, Forest Of Illusion, Fortress of Ancient Dragons, Lost Fragment, and Drowned Valley.Disk Two: The Draggons, Galdorb - Home, Chronomantique, Prisoners of Fate, Gale, and my girlfriend likes ZelbessDisk Three: The best of the three. Garden Of God, Chronopolis, Fates, Jellyfish sea, Burning Orphange, Dragon's Prayer, Tower Of Stars, Dragon God, and Radical Dreamers - Unstealable Jewel.Overall, this is a excellent soundtrack and should be brought by those that like video game music.Xander Cross


# Cleaning data

### (1.1) Remove punctuation

In [8]:
import string

def remove_punctuation(my_text):
    for punctuation in string.punctuation:
        my_text = my_text.replace(punctuation,'')
    return my_text

### (1.2) Lower case

In [9]:
def make_lower(my_text):
    if my_text is None:
        return None
    return my_text.lower()

### (1.3) Remove Numbers

In [10]:
def remove_numbers(my_text):
    my_text = ''.join([char for char in my_text if not char.isdigit()])
    return my_text

### (1.4) Remove Stopwords

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(my_text):
    
    stop_words = set(stopwords.words('english')) ## Limitation as some reviews are written in spanish
    word_tokens = word_tokenize(my_text)

    tokenized_sentence_no_stopwords = [word for word in word_tokens if not word in stop_words]
    return tokenized_sentence_no_stopwords

### (1.5) Lemmatizer

In [12]:
from nltk.stem import WordNetLemmatizer

def lemmatize_text(my_tokens_list):

    # Lemmatizing the verbs
    verb_lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v") # v --> verbs
        for word in my_tokens_list
    ]
    
    # 2 - Lemmatizing the nouns
    noun_lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "n") # n --> nouns
        for word in verb_lemmatized
    ]

    return ' '.join(noun_lemmatized)

### (1.6) Preprocessing main function

In [13]:
def text_preprocessing(df):
    
    df['cleaned'] = df['text'].apply(remove_punctuation)

    df['cleaned'] = df['cleaned'].apply(make_lower)

    df['cleaned'] = df['cleaned'].apply(remove_numbers)

    df['cleaned'] = df['cleaned'].apply(remove_stopwords)

    df['cleaned'] = df['cleaned'].apply(lemmatize_text)

    return df

In [14]:
train_sample_df = train_df.sample(frac=0.1)

In [30]:
train_sample_df.head()

Unnamed: 0,label,text
2296910,2,Fantastic toy beagle!: While this beagle might...
2863948,2,Worked fine: Good training tool. We used it at...
835715,2,One of Grisham's best: This is the best that G...
3235453,2,"entertaining, not as deep as promised: I found..."
1114717,1,HORRIBLE PURCHASE: BEWARE!!! DO NOT PURCHASE T...


In [29]:
preproc_train_sample_df = train_sample_df.apply(text_preprocessing)
preproc_train_sample_df.info()

KeyError: 'text'