## Notebook used to create and test the models 

In [15]:
import pandas as pd
import seaborn as sns
import matplotlib
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import mean_absolute_error,mean_squared_error
from ast import literal_eval
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Dropout
from tensorflow.keras.layers import LSTM,Embedding
import numpy as np
import math

In [3]:
POLLUTION = pd.read_csv("data/TextPollution.csv")
POLLUTION.head()

Unnamed: 0.1,Unnamed: 0,headlines,variations
0,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[1.183673469387755, 1.0337662337662337, 1.1432..."
1,1,"[0.0, 1.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, ...","[0.5344827586206896, 0.8710217755443886, 0.547..."
2,2,"[0.0, 1.0, 5.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.1935483870967742, 0.8052884615384617, 1.155..."
3,3,"[0.0, 6.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, ...","[0.8648648648648649, 0.9253731343283581, 1.144..."
4,4,"[0.0, 2.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[1.75, 1.6806451612903226, 1.5001070893124868,..."


### gets both the features and labels matrices from the dataset and scales the features matrix

In [4]:
def get_matrices():
    
    headlines =  POLLUTION['headlines'].apply(literal_eval)
    
    matrix_headlines = list()
    
    for i in headlines.keys():
        matrix_headlines.append([float(x) for x in headlines[i]])
    
    variations = POLLUTION['variations'].apply(literal_eval)

    matrix_variations = list()
    
    for i in variations.keys():
        matrix_variations.append([float(x) for x in variations[i]])
    
    return  (matrix_headlines,matrix_variations)

def scale_matrix(matrix):
    
    standard_scaler = preprocessing.StandardScaler()
    standard_scaler.fit(matrix)
    matrix = standard_scaler.transform(matrix)
    
    return [list(x) for x in matrix]

headlines,variations = get_matrices()
headlines = scale_matrix(headlines)
POLLUTION['headlines'] = headlines
POLLUTION['variations'] = variations
##print(headlines)

In [5]:
POLLUTION.head()

Unnamed: 0.1,Unnamed: 0,headlines,variations
0,0,"[-0.7538061585358591, -0.9961555380372183, -0....","[1.183673469387755, 1.0337662337662337, 1.1432..."
1,1,"[-0.7538061585358591, -0.34015067152490386, -0...","[0.5344827586206896, 0.8710217755443886, 0.547..."
2,2,"[-0.7538061585358591, -0.34015067152490386, 1....","[1.1935483870967742, 0.8052884615384617, 1.155..."
3,3,"[-0.7538061585358591, 2.9398736610366685, 0.67...","[0.8648648648648649, 0.9253731343283581, 1.144..."
4,4,"[-0.7538061585358591, 0.31585419498741063, -0....","[1.75, 1.6806451612903226, 1.5001070893124868,..."


In [6]:
X = np.array(POLLUTION['headlines'].to_list())
y = np.array(POLLUTION['variations'].to_list())

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
print("elements in training set: "  +  str(len(X_train)))
print("elements in testing  set: " +  str(len(X_test)))

elements in training set: 18
elements in testing  set: 9


In [43]:
def run_tests(y_test,predictions):
    mae = mean_absolute_error(y_test,predictions)
    mse = mean_squared_error(y_test,predictions)
    rmse = math.sqrt(mean_squared_error(y_test,predictions))
    
    print("MEAN ABSOLUTE ERROR: " + str(mae))
    print("MEAN SQUARED ERROR: " + str(mse))
    print("ROOT MEAN SQUARED ERROR: " + str(rmse))
    
    return (mae,mse,rmse)

### Creates the dnn model for a given input and output size

In [32]:
def create_model_dnn(input_size,output_size):
    model = Sequential()
    model.add(Dense(64,input_size))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(output_size,activation='softmax'))
    
    model.compile(optimizer='rmsprop',loss='mse',metrics=['mean_squared_error'])
    
    return model

### Creates a lstm model for a given input and output size 
### Working can be improved

In [36]:
def create_model_lstm(input_size,output_size):
    model = Sequential()
    model.add(Embedding(input_size,output_size))
    model.add(LSTM(128))
    model.add(Dropout(0.5))
    model.add(Dense(output_size,activation='sigmoid'))
    
    model.compile(loss='mse',optimizer='rmsprop',
                  metrics=['mean_squared_error'])
    
    return model

In [17]:
dnn = create_model_dnn(len(X_train[0]),len(y_train[0]))

In [18]:
dnn.fit(X_train,y_train,batch_size=32,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100


Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x19d8feeb448>

In [19]:
predictions = dnn.predict(X_test)

In [28]:
run_tests(y_test,predictions)

MEAN ABSOLUTE ERROR: 0.8346067586141983
MEAN SQUARED ERROR: 0.9923563180945505
ROO MEAN SQUARED ERROR0.9961708277672814


(0.8346067586141983, 0.9923563180945505, 0.9961708277672814)

In [38]:
lstm = create_model_lstm(len(X_train[0]),len(y_train[0]))

In [46]:
lstm.fit(X_train,y_train,batch_size=16,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100


Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x19d9465ba48>

In [47]:
predictions = lstm.predict(X_test)

In [48]:
run_tests(y_test,predictions)

MEAN ABSOLUTE ERROR: 0.33790203579180805
MEAN SQUARED ERROR: 0.2687145946571488
ROO MEAN SQUARED ERROR: 0.5183768847635365


(0.33790203579180805, 0.2687145946571488, 0.5183768847635365)