In [1]:
from google.colab import drive
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from re import sub, findall
from sklearn.preprocessing import OneHotEncoder,LabelBinarizer, MinMaxScaler
from datetime import datetime
from sklearn.model_selection import train_test_split
import keras

from keras.layers import Dense, Dropout
from keras import regularizers
from keras import BatchNormalization
import numpy as np

# Mounting file system


In [2]:
drive.mount("gdrive", force_remount=True)

Mounted at gdrive


In [3]:
%cd gdrive/MyDrive

/content/gdrive/MyDrive



# Loading the dataset and doing some exploration


In [4]:
df= pd.read_csv("parkReviews.csv", sep= ",", encoding='latin-1')

In [None]:
# delete rows with missing fields

In [5]:
df.dropna(inplace=True)

In [6]:
df= df[df['Year_Month'] != "missing"]


In [7]:
#common stopwords in english
stopwords=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

def purify_text(s):
  # Replace HTML tags with spaces
  s=s.lower()
  s = sub(r"<.*?>", " ", s)
 # print(s)
  # Replace all punctuation with spaces
  s = sub(r"[^a-zA-Z0-9')\s]", " ", s)
  s=s.split()[:50]
  s = [word for word in s if word not in stopwords]

  s=' '.join(s)
  return s



In [8]:
# Limit the 'Review_Text' field to have at most 50 words and do purification
df['Review_Text'] = df['Review_Text'].apply(lambda x: purify_text(x))

In [9]:
coun_vect = CountVectorizer(stop_words='english',binary=True, max_df=2, min_df=1)
#max_df=1 means that if a term is present in all reviews, than don't put it in the dict

In [10]:
count_array = coun_vect.fit_transform(df["Review_Text"]).toarray()
count_array.shape


(40043, 12665)

In [11]:
branches = LabelBinarizer().fit_transform(df["Branch"])

In [12]:
ohe = OneHotEncoder()
rev_locs=ohe.fit_transform(df[["Reviewer_Location"]])

# Convert the one-hot encoded data to a dense array
rev_locs = rev_locs.toarray()

# Create a DataFrame from the one-hot encoded array
rev_locs = pd.DataFrame(rev_locs, columns=ohe.categories_[0], dtype=bool)


In [13]:
df["Year_Month"]= df['Year_Month'].apply(lambda x: int(datetime.strptime(x, "%Y-%m").timestamp()) * 1000)

In [14]:
count_array_dataframe= pd.DataFrame(count_array,dtype='bool')

In [15]:
del count_array

In [16]:
count_array_dataframe.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12655,12656,12657,12658,12659,12660,12661,12662,12663,12664
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
count_array_dataframe.shape

(40043, 12665)

In [18]:
del coun_vect

In [19]:
branches= pd.DataFrame(branches)
branches= branches.astype({0:bool,1:bool,2:bool})

In [20]:
count_array_dataframe["Year_Month"]=df["Year_Month"]
count_array_dataframe["branch0"]=branches[0]
count_array_dataframe["branch1"]=branches[1]
count_array_dataframe["branch2"]=branches[2]

In [21]:
count_array_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40043 entries, 0 to 40042
Columns: 12669 entries, 0 to branch2
dtypes: bool(12668), float64(1)
memory usage: 484.1 MB


In [22]:
for col in rev_locs.columns:
  count_array_dataframe[col] = rev_locs[col].values


  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_array_dataframe[col] = rev_locs[col].values
  count_arra

In [23]:
count_array_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,Zambia,Zimbabwe,Åland Islands
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40038,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
40039,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
40040,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
40041,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [24]:
count_array_dataframe.shape

(40043, 12831)

In [25]:
#scaling year_month field
print(count_array_dataframe["Year_Month"][3])
scaler= MinMaxScaler()
count_array_dataframe["Year_Month"]= scaler.fit_transform(count_array_dataframe[["Year_Month"]])
print(count_array_dataframe["Year_Month"][3])

1554076800000.0
0.9910394265232974


In [26]:
y= df["Rating"]
y.value_counts() # labels are slightly unbalanced, need to see f1 score

5    21908
4    10086
3     4782
2     1929
1     1338
Name: Rating, dtype: int64

In [27]:
X_train, X_test, y_train, y_test = train_test_split(count_array_dataframe, y, test_size=0.33, random_state=42)

In [28]:
del df

In [None]:
X_train[0]

23943    False
16584    False
21968    False
4513     False
4596     False
         ...  
6265     False
11284    False
38158     True
860      False
15795    False
Name: 0, Length: 26828, dtype: bool

In [29]:
X_train = np.asarray(X_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)


In [33]:
#time for keras

model = keras.Sequential(
    [
         Dense(input_shape=(X_train.shape[1],),units=64, activation='relu', kernel_regularizer= regularizers.L1(0.01),
       # Dense(input_shape=(X_train.shape[1],),units=32, activation='relu', kernel_regularizer= regularizers.L1(0.01),
              kernel_initializer= keras.initializers.HeUniform()),
        keras.layers.Dropout(.2,),
        Dense(units=64, activation='relu', kernel_initializer= keras.initializers.HeUniform()),
        keras.layers.Dropout(.3, ),
        Dense(units=32, activation='sigmoid',kernel_initializer= keras.initializers.HeUniform() ),

        Dense(units=6, activation="softmax"),
    ]
)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=10**-1, clipvalue=0.5),
                loss=keras.losses.SparseCategoricalCrossentropy(
    from_logits=False,
    ignore_class=None,
    reduction="auto",
    name="sparse_categorical_crossentropy",
),
                metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 64)                821248    
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_9 (Dense)             (None, 64)                4160      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_10 (Dense)            (None, 32)                2080      
                                                                 
 dense_11 (Dense)            (None, 6)                 198       
                                                                 
Total params: 827,686
Trainable params: 827,686
Non-tr

In [48]:
# model with residual block:
from keras.layers import Input,Add
from keras.models import Model


input_layer = Input(shape=(X_train.shape[1],))
#dropout_input = Dropout(0.2)(input_layer)
hidden_layer1 = Dense(units=64, activation='relu', kernel_regularizer= regularizers.L1(0.01))(input_layer)
#dropout1 = Dropout(0.4)(hidden_layer1)
#skip_connection = Add()([hidden_layer1, dropout1])
hidden_layer2 = Dense(units=64, activation='relu', kernel_regularizer= regularizers.L1(0.01))(hidden_layer1)
#dropout2 = Dropout(0.2)(hidden_layer2)
skip_connection = Add()([hidden_layer2, hidden_layer1])
output_layer = Dense(6, activation='softmax')(skip_connection)


model = Model(inputs=input_layer, outputs=output_layer)

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [50]:
epochs = 100
batch_size = 30
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,validation_split=0.2,
          shuffle=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100

KeyboardInterrupt: ignored