In [6]:
import os
import pandas as pd
import dvc.api
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
import os
import pandas as pd
import numpy as np
import json
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
import seaborn as sns
import dvc.api
import mlflow

def handle_nan(df):
    df = df.drop('style', axis=1)
    df = df.drop('image', axis=1)
    df = df.drop('reviewerName', axis=1)
    df['reviewText'].fillna('', inplace=True)
    df['vote'].fillna(0, inplace=True)
    df['summary'].fillna('', inplace=True)
    return df
    
def add_features(df):
    df['reviewTextLength'] = df['reviewText'].astype(str).apply(len)
    df['numbers_amount'] = df['reviewText'].str.count(r'\d+')
    return df
    
def to_categorical(df):
    one_hot_data_origin = pd.get_dummies(df['data_origin'], prefix='origin_')
    df = pd.concat([df, one_hot_data_origin], axis=1)
    df = df.drop('data_origin', axis=1)
    return df
    
def drop_useless(df):
    df = df.drop('asin', axis=1)
    df = df.drop('reviewTime', axis=1)
    df = df.drop('reviewerID', axis=1)
    return df
    
def standarize(df):
    X = df[['vote', 'numbers_amount', 'unixReviewTime', 'reviewTextLength']].values
    scaler = StandardScaler()
    X_std = scaler.fit_transform(X)
    df[['vote', 'numbers_amount', 'unixReviewTime', 'reviewTextLength']] = X_std
    return df
    
def remove_emojis(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emotikony uśmiechnięte
                               u"\U0001F300-\U0001F5FF"  # symbole
                               u"\U0001F680-\U0001F6FF"  # emotikony transportu
                               u"\U0001F1E0-\U0001F1FF"  # flagi krajów
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def normalize_text(df, text_column_name):
    nltk.download('wordnet')
    nltk.download('stopwords')
    nltk.download('omw-1.4')
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    column_name = 'tmp'
    df[column_name] = df[text_column_name].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
    df[column_name] = df[column_name].apply(lambda x: word_tokenize(x.lower()))
    df[column_name] = df[column_name].apply(lambda x: [word for word in x if word not in stop_words])
    lemmatizer = WordNetLemmatizer()
    df[column_name] = df[column_name].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    df[column_name] = df[column_name].apply(lambda x: [word for word in x if word.isalnum()])
    df[column_name] = df[column_name].apply(lambda x: ' '.join(x))
    vectorizer = CountVectorizer(binary=True, max_df= 0.75, min_df = 2, max_features=1000)
    count_matrix = vectorizer.fit_transform(df[column_name])
    count_array = count_matrix.toarray()
    df_words = pd.DataFrame(data=count_array, columns = vectorizer.get_feature_names_out())
    df_words = df_words.add_prefix(text_column_name+'_')
    df = pd.concat([df, df_words], axis=1)
    df = df.drop(column_name, axis=1)
    return df

def clean_text(df):
    df = normalize_text(df, 'reviewText')
    df = normalize_text(df, 'summary')
    return df
    
def to_float(df):
    df['vote'] = df['vote'].apply(lambda x: x.replace(',', '.') if isinstance(x, str) else x)
    return df

def extract(df):
    df = to_float(df)
    df = drop_useless(df)
    df = handle_nan(df)
    df = add_features(df)
    df = to_categorical(df)
    df = standarize(df)
    df = clean_text(df)
    return df



In [7]:
params = dvc.api.params_show()

start_file_train = 'data/train/train_raw.json'
start_file_test = 'data/test/test_raw.json'

df = pd.read_json(start_file_train, lines=True)
df = df.head(1000)
df = extract(df)

[nltk_data] Downloading package wordnet to /home/kaka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kaka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/kaka/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kaka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kaka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/kaka/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
df

Unnamed: 0,overall,verified,reviewText,summary,unixReviewTime,vote,reviewTextLength,numbers_amount,origin__AMAZON_FASHION_5.json,origin__All_Beauty_5.json,...,summary_one,summary_product,summary_really,summary_software,summary_star,summary_use,summary_well,summary_work,summary_worked,summary_worth
0,5,False,I purchased Pinnacle Studio 18 Ultimate and ha...,Much Better than Studio 18,0.267124,-0.199730,-0.330005,-0.123903,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,True,"It's Bare Escentuals, 'nuff said!",Five Stars,0.983068,-0.199730,-0.621682,-0.413059,0,1,...,0,0,0,0,1,0,0,0,0,0
2,5,False,"I use this on my iMac to make video DVD's, bur...",Great Basic DVD/CD burning and copying applica...,-0.538202,-0.123055,0.161058,-0.413059,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,False,MS manage to change the Interface again. For ...,Another POS from MS,-3.948891,-0.097497,-0.086215,-0.268481,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,True,love this product! ran out of face wash on a t...,body & face,0.066235,-0.199730,-0.553769,-0.413059,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3,False,I happen to like Microsoft as a company. I go ...,A step backward for Microsoft and Office,-2.178058,2.483905,1.895450,2.044765,0,0,...,0,0,0,0,0,0,0,0,0,0
996,1,True,"Unfortunately, this Flex Supreme does NOT have...",Did not fit my feet,0.744125,-0.199730,-0.522425,-0.413059,1,0,...,0,0,0,0,0,0,0,0,0,0
997,4,False,"I will be honest, I'm not a big MAC person. Ye...",Meets Your Basic Office Needs,-1.582471,-0.199730,0.133196,-0.413059,0,0,...,0,0,0,0,0,0,0,0,0,0
998,5,True,Great product - my wife loves it,Five Stars,0.154732,-0.199730,-0.622553,-0.413059,0,1,...,0,0,0,0,1,0,0,0,0,0


In [8]:
def get_text_only(df):
    df_text_only = df.filter(regex='^reviewText_|^overall|^summary_')
    df_text_only.name = 'text_only'
    return df_text_only

def get_not_text(df):
    df_not_text = df.filter(regex='^origin_|^overall|^vote|^numbers_amount|^verified|^reviewTextLength|^unixReviewTime')
    df_not_text.name = 'not_text'
    return df_not_text


df_1 = get_text_only(df)
df_2 = get_not_text(df)
dummy = DummyClassifier(random_state=1)
RF_model = RandomForestClassifier(random_state=1)
SVM_model = LinearSVC(random_state=1, max_iter=100000)
results = cross_validate(RF_model, df.drop(['overall', 'reviewText', 'summary'],axis=1), df['overall'], cv=5, 
                         scoring=['f1_micro', 'accuracy'])

In [9]:
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.1)

In [10]:
selector.fit_transform(df_2.drop('overall', axis=1))

array([[ 0.        ,  0.44077433, -0.19955576, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.        , -1.8567081 , -0.19955576, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , -0.46197105, -0.19955576, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.        , -0.72028949, -0.19955576, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.        , -1.99180043,  0.16498076, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , -0.19817589, -0.19955576, ...,  0.        ,
         1.        ,  0.        ]])

In [46]:
results

{'fit_time': array([0.14909601, 0.15661883, 0.15390158, 0.13915825, 0.1435194 ]),
 'score_time': array([0.0140729 , 0.0155642 , 0.01106143, 0.01191902, 0.01455569]),
 'test_f1_micro': array([0.59 , 0.595, 0.62 , 0.59 , 0.62 ]),
 'test_accuracy': array([0.59 , 0.595, 0.62 , 0.59 , 0.62 ])}

In [5]:
import os

os.chdir('/home/kaka/studia/pdow/summer-23-lab3-6-KunickiKarol')
!pwd


/home/kaka/studia/pdow/summer-23-lab3-6-KunickiKarol
