<img src="Pictures/sec2_2.jpg" width="1500" height = "300">

In [1]:
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
import tensorflow as tf # deep learning
from tensorflow.keras.models import Sequential # deep learning
from tensorflow.keras.layers import Dense, Dropout, LSTM, Activation # deep learning
from nltk.corpus import stopwords # natural language processing
from pyspark.sql import functions # data processing
from pyspark.ml.feature import StopWordsRemover # natural language processing
from keras.preprocessing.text import Tokenizer # natural language processing
import pyspark.pandas as ps # data processing
from nltk.stem import PorterStemmer # natural language processing
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D # deep learning
from nltk import pos_tag #Tags words with their parts of speech
from nltk.corpus import stopwords #Contains a list of stopwords
from nltk.corpus import wordnet #Contains a list of wordnet words
from keras_preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator, TransformerMixin #Base class for transformers
from sklearn.pipeline import make_pipeline #Used to create a pipeline
import nbformat #Used to read the notebook
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer #Counts the number of times a word appears in a document
from nltk.stem import WordNetLemmatizer #Let's you lemmatize words
from nltk.corpus import wordnet #Contains a list of wordnet words



In [7]:
def sentiment_rating(rating):
    # Replacing ratings of 4,5 with 1 (good) and 1,2 with 0 (not good)
    if(float(rating) > 6):
        return 1
    else: 
        return 0

def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text): #Lemmatize the words
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            pos = pos_tag([i.strip()])
            word = lemmatizer.lemmatize(i.strip(),get_simple_pos(pos[0][1]))
            final_text.append(word.lower())
    return " ".join(final_text)

class Lemmatize(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['extract'] = X['extract'].apply(lemmatize_words)
        return X

class removeStopWords(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['extract'] = X['extract'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
        return X

class StemTheWords(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):       
        X['extract'] = X['extract'].apply(lambda x: " ".join(PorterStemmer().stem(word) for word in x.split()))
        return X
    
class dropTheNullValues(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.dropna()
    
class getRelevantColumns(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[['extract', 'score']]
    
class returnXAndY(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.iloc[:,0],X.iloc[:,1]
       

class convertYtoBinary(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['score'] = X['score'].apply(sentiment_rating)
        return X


class makeItLowerCase(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['extract'] = X['extract'].str.lower()        
        return X
    
class replaceHTMLelements(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['extract'] = X['extract'].str.replace('<.*?>', '')       
        return X
    
class onlyTakeEnglishRecords(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X[X['lang'] == 'en']        
        return X

class convertObjectColumnsToStringColumns(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        string_col = X.select_dtypes(include="object").columns
        X[string_col] = X[string_col].astype("string")
        return X

In [10]:
dataframe = pd.read_csv('Phone Reviews/phone_user_review_file_2.csv',encoding='latin-1')
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114925 entries, 0 to 114924
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   phone_url  114925 non-null  object 
 1   date       114925 non-null  object 
 2   lang       114925 non-null  object 
 3   country    114925 non-null  object 
 4   source     114925 non-null  object 
 5   domain     114925 non-null  object 
 6   score      112166 non-null  float64
 7   score_max  112166 non-null  float64
 8   extract    113965 non-null  object 
 9   author     113290 non-null  object 
 10  product    114925 non-null  object 
dtypes: float64(2), object(9)
memory usage: 9.6+ MB


In [17]:
files = os.listdir('Phone Reviews') # get all the files in the directory
print(files) # print the files

'''
------------------------------------------------------------------------------------------------------------------------
Loads dataframes
------------------------------------------------------------------------------------------------------------------------
'''
dataframes = [] # create an empty list
for file in files: # loop through the files
    if file.endswith('.csv'): # if the file ends with .csv, but exclude the first file which we already loaded
        dataframes.append(pd.read_csv('Phone Reviews/' + file, encoding='latin-1')) # We set the encoding to latin-1 because the file is encoded in latin-1

'''
------------------------------------------------------------------------------------------------------------------------
Cleans the dataframes.
------------------------------------------------------------------------------------------------------------------------ 
'''
processingPipeline_stem = make_pipeline(onlyTakeEnglishRecords(), getRelevantColumns(), convertYtoBinary(), makeItLowerCase(), replaceHTMLelements(), dropTheNullValues(), removeStopWords(), StemTheWords())
processingPipeline_lem = make_pipeline(onlyTakeEnglishRecords(), getRelevantColumns(), convertYtoBinary(), makeItLowerCase(), replaceHTMLelements(), dropTheNullValues(), removeStopWords(),Lemmatize())

betterFrames_stem = []
betterFrames_lem = []
count = 1
for frame in dataframes:
    betterFrames_lem.append(processingPipeline_lem.fit_transform(frame))
    betterFrames_stem.append(processingPipeline_stem.fit_transform(frame))
    print("Done with " + str(count))
    count += 1
    
'''
------------------------------------------------------------------------------------------------------------------------
Convert pandas to numpy arrays.
------------------------------------------------------------------------------------------------------------------------
'''
numpyFrames_stem = []
numpyFrames_lem = []
for frame in betterFrames_stem:
    numpyFrames_stem.append(frame.to_numpy())

for frame in betterFrames_lem:
    numpyFrames_lem.append(frame.to_numpy())

'''
------------------------------------------------------------------------------------------------------------------------
Concatenate the numpy arrays.
------------------------------------------------------------------------------------------------------------------------
'''
masterArray_stem = np.concatenate((numpyFrames_stem), axis=0)
masterArray_lem = np.concatenate((numpyFrames_lem), axis=0)

X_stem = masterArray_stem[:,0]
y_stem = masterArray_stem[:,1]

X_lem = masterArray_lem[:,0]
y_lem = masterArray_lem[:,1]

print(X_stem.shape)
print(y_stem.shape)

'''
------------------------------------------------------------------------------------------------------------------------
Save the arrays.
------------------------------------------------------------------------------------------------------------------------
'''
np.save('Clean Numpy Arrays New/X_stem.npy', X_stem)
np.save('Clean Numpy Arrays New/y_stem.npy', y_stem)

np.save('Clean Numpy Arrays New/X_lem', X_lem)
np.save('Clean Numpy Arrays New/y_lem', y_lem)

['phone_user_review_file_1.csv', 'phone_user_review_file_2.csv', 'phone_user_review_file_3.csv', 'phone_user_review_file_4.csv', 'phone_user_review_file_5.csv', 'phone_user_review_file_6.csv']


  X['extract'] = X['extract'].str.replace('<.*?>', '')
  X['extract'] = X['extract'].str.replace('<.*?>', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lambda x: " ".join(PorterStemmer().stem(word) for word in x.split()))
  X['extract'] = X['extract'].str.replace('<.*?>', '')


Done with 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lemmatize_words)
  X['extract'] = X['extract'].str.replace('<.*?>', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

Done with 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lemmatize_words)
  X['extract'] = X['extract'].str.replace('<.*?>', '')


Done with 3


  X['extract'] = X['extract'].str.replace('<.*?>', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lemmatize_words)
  X['extract'] = X['extract'].str.replace('<.*?>', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/sta

Done with 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lemmatize_words)
  X['extract'] = X['extract'].str.replace('<.*?>', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

Done with 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['extract'] = X['extract'].apply(lemmatize_words)
  X['extract'] = X['extract'].str.replace('<.*?>', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

Done with 6
(550531,)
(550531,)
