In [None]:
import os
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from chardet import detect
from IPython.display import display
from scipy.stats import zscore
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy import stats
import pylab as pl

from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag

from sklearn import model_selection
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import seaborn as sns

In [None]:
os.listdir()

In [None]:
np.random.seed(42)

# Read data

In [None]:
train_filepath = 'data/train.csv'
test_filepath = 'data/test.csv'

def get_encoding(file):
    """Guess the text encoding of ``file`` from its raw bytes.

    The whole file is read in binary mode and passed to chardet's ``detect``;
    the ``'encoding'`` entry of the detection result is returned.
    """
    with open(file, mode='rb') as raw_file:
        detection = detect(raw_file.read())
    return detection['encoding']

# Load both CSV files, decoding each one with the encoding detected for that specific file.
data_train = pd.read_csv(train_filepath, sep=",", encoding = get_encoding(train_filepath))
data_test = pd.read_csv(test_filepath, sep=",", encoding = get_encoding(test_filepath))

# Data exploration and cleaning

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
data_train.info()

In [None]:
display(data_train)

## Correlation

In [None]:
df_temp = data_train.copy()

In [None]:
# Convert string column to numeric

# Show the distinct values of 'B' and how many of them there are.
print(df_temp['B'].unique())
print(len(df_temp['B'].unique()))
# Encode 'B' as integer category codes so it can be plotted next to the numeric columns
# (pandas uses the code -1 for missing values).
df_temp['B_numeric'] = df_temp['B'].astype("category").cat.codes

In [None]:
# One histogram per numeric column of the exploration copy.
df_temp.hist(figsize=(12, 12), bins=30)
plt.suptitle("Histogram for each numeric input variable")
plt.show()

In [None]:
# Scatter-matrix of the numeric features, coloured by the target class 'P'.
# Rows with any missing value are dropped first.
df_temp = df_temp.dropna()

feature_names = ['A', 'NN', 'B_numeric']
X = df_temp[feature_names]
y = df_temp['P']
# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap is the
# supported lookup. The colormap is passed on so the points are actually drawn with it.
cmap = plt.get_cmap('gnuplot')
scatter = scatter_matrix(X, c = y, cmap = cmap, s=40, figsize=(12,12))
plt.suptitle('Scatter-matrix')

## Convert NN column to range classes

In [None]:
data_train['NN'].plot(kind='box', subplots=True, sharex=False, sharey=False, figsize=(10,10), title='NNs')
plt.show()

In [None]:
print(max(data_train['NN']))
print(min(data_train['NN']))
plt.figure(figsize=(20,8))
plt.plot(data_train['NN'])
plt.show()

In [None]:
# Keep only the training rows that have a value in 'NN';
# the row count is printed before and after.
print(len(data_train))
data_train = data_train[data_train['NN'].notna()]
print(len(data_train))

In [None]:
# print(len(data_train[data_train['NN'] < 0.01]) + len(data_train[data_train['NN'] > 100]))

# print(len(data_train))
# data_train = data_train[data_train['NN'] >= 0.01]
# print(len(data_train))
# data_train = data_train[data_train['NN'] <= 100]
# print(len(data_train))

In [None]:
# Remove 'NN' outliers: keep only the rows whose 'NN' z-score has an absolute value below 2.
# The row count is printed before and after.

print(len(data_train))

filtered_entries = np.abs(stats.zscore(data_train['NN'])) < 2
data_train = data_train[filtered_entries]

print(len(data_train))

In [None]:
data_train['NN'].hist()

In [None]:
data_train['NN'].plot(kind='box', subplots=True, sharex=False, sharey=False, figsize=(10,10), title='NNs')
plt.show()

In [None]:
print(max(data_train['NN']))
print(min(data_train['NN']))
plt.figure(figsize=(20,8))
plt.plot(data_train['NN'])
plt.show()

In [None]:
data_train['NN'].plot(kind='box', subplots=True, sharex=False, sharey=False, figsize=(10,10), title='NNs')
plt.show()

In [None]:
print(data_train['NN'].quantile(.05))
print(data_train['NN'].quantile(.95))

print(data_train['NN'].quantile(.10))
print(data_train['NN'].quantile(.90))

In [None]:
def calculate_NN_range_class(NN):
    """Map a numeric ``NN`` value to a coarse range label.

    Values below 1 become ``"class0"`` and values above 68 become ``"class24"``.
    Everything in between is truncated to an integer and bucketed in steps of
    three: ``"class" + str(int(NN) // 3 + 1)``.
    """
    if NN < 1:
        return "class0"
    if NN > 68:
        return "class24"
    bucket = int(NN) // 3 + 1
    return "class" + str(bucket)

data_train['NN_range_class'] = data_train["NN"].apply(calculate_NN_range_class)

In [None]:
# # Test
# aa = range(-5,80)
# for a in aa:
#     print(calculate_NN_range_class(a), a)

In [None]:
display(data_train)

In [None]:
# Test data cell ###############################

# Replace every missing value in the test frame (all columns) with an empty string.
data_test = data_test.replace(np.nan, '', regex=True)

# Report how many test rows there are and how many of them have no 'NN' value.
print("Total:", len(data_test["NN"]))
print("Empty NN cases:", len(data_test[data_test["NN"] == ""]))

# Mean of the 'NN' values that are present; convert_value uses it to fill the empty ones.
# NOTE(review): this is the mean of the test set's own 'NN' column, and unlike the
# training frame no outlier rows are removed from the test frame.
aa = data_test[data_test["NN"] != ""]
mean_NN = aa['NN'].mean()
print("Mean:", mean_NN)

# Box plot of the non-empty test 'NN' values.
aa['NN'].plot(kind='box', subplots=True, sharex=False, sharey=False, figsize=(10,10))
plt.show()

In [None]:
def convert_value(value):
    """Return the global ``mean_NN`` for an empty-string value, otherwise ``value`` unchanged."""
    return mean_NN if value == "" else value

In [None]:
data_test['NN'] = data_test["NN"].apply(convert_value)

In [None]:
# Test data cell ###############################

data_test['NN_range_class'] = data_test["NN"].apply(calculate_NN_range_class)

## Drop some columns

In [None]:
# Drop the columns that are not used further on (the raw 'NN' is superseded by
# 'NN_range_class'); the column count is printed before and after.
print(len(data_train.keys()))
data_train = data_train.drop(columns=['A', 'D', 'F', 'H', 'I', 'J', 'M', 'NN', 'O'])
print(len(data_train.keys()))

In [None]:
display(data_train)

In [None]:
# Test data cell ###############################
# Same column drop as for the training frame, except that 'A' is kept (it is used
# later to build the prediction output) and 'O' is not listed here.
# NOTE(review): presumably 'O' does not exist in the test file — confirm.
print(data_test.keys())
print(len(data_test.keys()))
data_test = data_test.drop(columns=['D', 'F', 'H', 'I', 'J', 'M', 'NN'])
print(len(data_test.keys()))

## Fix rows with missing data

In [None]:
# display(data_train[data_train['P']==7])
display(data_train[data_train['P']!=1])

In [None]:
# print(data_train[data_train['P']==7].info())
data_train[data_train['P']!=1].info()
data_train[data_train['P']==1].info()
# data_train.info()

In [None]:
# For every class except P == 1, convert missing 'K' values (NaN) to the empty string ''.
not_class_1 = data_train['P'] != 1
data_train.loc[not_class_1, 'K'] = data_train.loc[not_class_1, 'K'].replace(np.nan, '', regex=True)

In [None]:
# print(data_train[data_train['P']==7].info())
data_train[data_train['P']!=1].info()
data_train[data_train['P']==1].info()
# data_train.info()

In [None]:
# Check
data_train.loc[(data_train['P']!=1) & (data_train['K']==""),'K'].shape

In [None]:
# Column summaries for the non-class-1 and class-1 rows before the drop.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each report.
data_train[data_train['P']!=1].info()
data_train[data_train['P']==1].info()

# Remove every row that still has a missing 'L' or 'K'.
data_train = data_train.dropna(subset=['L', 'K'])

# The same summaries after the drop.
data_train[data_train['P']!=1].info()
data_train[data_train['P']==1].info()

In [None]:
display(data_train)

In [None]:
def edit_text(text):
    """Clean markup-like characters out of a text field.

    Each of ``_ ~ , * / \\ &`` is replaced by a single space, while each of
    ``[ ] " :`` is removed entirely.
    """
    to_space = "_~,*/\\&"
    to_remove = "[]\":"
    return text.translate(str.maketrans(to_space, " " * len(to_space), to_remove))

In [None]:
# Run edit_text over every free-text column of the training frame.
for text_column in ("E", "G", "K", "L"):
    data_train[text_column] = data_train[text_column].apply(edit_text)

In [None]:
display(data_train)

In [None]:
# Test data cell ###############################

# Run edit_text over every free-text column of the test frame.
for text_column in ("E", "G", "K", "L"):
    data_test[text_column] = data_test[text_column].apply(edit_text)

## Imbalance fix

In [None]:
print(data_train['P'].value_counts())

In [None]:
data_train['P'].hist(figsize=(12,8), bins=len(data_train['P'].unique()), grid=False)

In [None]:
data_train_cleaned_1 = data_train[data_train['P'] != 1]

data_train_cleaned_1['P'].hist(figsize=(12,8), bins=len(data_train['P'].unique())-1, grid=False)

### Imbalance fix 1

In [None]:
# # Second csv result generated with this

# l='P'

# g = df.groupby(l, group_keys=False)
# balanced_df = pd.DataFrame(g.apply(lambda x: x.sample(g.size().min()))).reset_index(drop=True)

# print(balanced_df['P'].value_counts())

In [None]:
# data_train = balanced_df

### Imbalance fix 2

In [None]:
# Third csv result generated with this

# Average number of rows per class over all classes except class 1
# (int() truncates the mean towards zero; it does not round).
avg_items_except_c1 = int(data_train[data_train['P']!=1]['P'].value_counts().mean())

# The computed average above is overridden by the hard-coded value below; the
# commented-out lines record the values used for earlier submissions.
# 'Imbalance fix 2' was real avg - 108
# avg_items_except_c1 = 150 # 'Imbalance fix 3'
# avg_items_except_c1 = 131 # 'Imbalance fix 4'
# avg_items_except_c1 = 123 # 'Imbalance fix 5'
avg_items_except_c1 = 165 # 'Imbalance fix 7'

print(avg_items_except_c1)

In [None]:
# Down-sample classes 1, 4 and 3 to avg_items_except_c1 rows each, always with the
# same seed so the selection is reproducible.
# (An earlier variant sampled avg_items_except_c1+15 rows per class instead.)
df_c1, df_c4, df_c3 = (
    data_train[data_train['P'] == label].sample(n=avg_items_except_c1, random_state=42)
    for label in (1, 4, 3)
)

# All rows of the remaining classes (2, 5, 6 and 7), with their per-class counts.
df_with_less_rows = data_train[data_train['P'].isin([2, 5, 6, 7])]
print(df_with_less_rows['P'].value_counts())

In [None]:
# Up-sample each class in df_with_less_rows to avg_items_except_c1 rows by adding
# the missing number of rows, drawn with replacement from that class.
lst = [df_with_less_rows]
for class_index, group in df_with_less_rows.groupby('P'):
    print(len(group))
    # A class that already has at least the target number of rows needs no extra
    # samples; a negative sample size would make DataFrame.sample raise ValueError.
    n_missing = max(avg_items_except_c1 - len(group), 0)
    # Fixed seed so that re-running this cell draws the same rows every time.
    lst.append(group.sample(n=n_missing, replace=True, random_state=42))
df_c2_c5_c6_c7 = pd.concat(lst)

In [None]:
frames = [df_c1, df_c4, df_c3, df_c2_c5_c6_c7]
result = pd.concat(frames)

In [None]:
result['P'].value_counts()

In [None]:
# Continue with the balanced frame. Rows are renumbered 0..n-1 and the old index is
# discarded (drop=True); previously the result of drop(columns=['index']) was thrown
# away, which left a stray 'index' column in data_train.
data_train = result.reset_index(drop=True)
data_train

## Create concatenated column of other columns

In [None]:
data_train.keys()

In [None]:
# Build one text field per row by joining the listed columns with single spaces.
all_fields = data_train['B']
for column_name in ['C', 'E', 'G', 'K', 'L', 'NN_range_class']:
    all_fields = all_fields + " " + data_train[column_name]
data_train['all_fields'] = all_fields

In [None]:
display(data_train)

In [None]:
data_train['all_fields']

In [None]:
data_train['all_fields'][2]

In [None]:
# Test data cell ###############################
data_test.keys()

In [None]:
# Test data cell ###############################
# Build one text field per row by joining the listed columns with single spaces.
test_all_fields = data_test['B']
for column_name in ['C', 'E', 'G', 'K', 'L', 'NN_range_class']:
    test_all_fields = test_all_fields + " " + data_test[column_name]
data_test['all_fields'] = test_all_fields

# Prep test dataset

In [None]:
# Overview of the prepared test frame: shape, column summary and the frame itself.
print(data_test.shape)
print()
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
data_test.info()
print()
display(data_test)

# Tokenization

In [None]:
# Split every concatenated text field into a list of NLTK word tokens.
data_train['splitted_sentence'] = list(map(word_tokenize, data_train['all_fields']))

In [None]:
print(data_train['splitted_sentence'][0])
print()
print(data_train['splitted_sentence'])

In [None]:
# Test data cell ###############################
# Split every concatenated text field into a list of NLTK word tokens.
data_test['splitted_sentence'] = list(map(word_tokenize, data_test['all_fields']))

In [None]:
# Test data cell ###############################
print(data_test['splitted_sentence'][0])
print()
print(data_test['splitted_sentence'])

# Lemmatisation

In [None]:
# Lemmatisation

# # WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
# tag_map = defaultdict(lambda : wn.NOUN)
# tag_map['J'] = wn.ADJ
# tag_map['V'] = wn.VERB
# tag_map['R'] = wn.ADV
# for index, sentence in enumerate(data_train['splitted_sentence']):
#     # Declaring Empty List to store the words that follow the rules for this step
#     Final_words = []
#     # Initializing WordNetLemmatizer()
#     word_Lemmatized = WordNetLemmatizer()
#     # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
#     for word, tag in pos_tag(sentence):
#         # if word not in stopwords.words('english') and word.isalpha():
#         word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
#         Final_words.append(word_Final)
#     data_train.loc[index,'tokens_temp_lemmatisat'] = str(Final_words)

#########################################

# Lemmatisation is disabled (see the commented-out code above), so 'tokens' is simply
# the string representation of each token list.
data_train['tokens'] = data_train['splitted_sentence'].astype(str)

In [None]:
display(data_train)

In [None]:
# Test data cell ###############################
data_test['tokens'] = data_test['splitted_sentence'].astype(str)
display(data_test)

# Data split

In [None]:
# Hold out 30% of the balanced training frame for evaluation (shuffled, fixed seed).
# NOTE(review): the split happens after some classes were up-sampled with replacement,
# so copies of the same row can end up in both splits and inflate the scores.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(data_train['tokens'],data_train['P'],test_size=0.3, random_state=42, shuffle=True)

# Vectorization

In [None]:
# String of ASCII punctuation characters; spacy_tokenizer tests tokens against it
punctuations = string.punctuation

# spaCy's English stop-word set. Stop-word removal is currently commented out in
# spacy_tokenizer, so stop_words is not used by the active code.
# NOTE(review): nlp (the 'en_core_web_trf' pipeline) is not referenced by any other
# cell shown in this notebook — confirm whether this load is still needed.
nlp = spacy.load('en_core_web_trf')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Blank English pipeline; spacy_tokenizer calls it to split text into tokens
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with the global ``parser`` and return lower-cased tokens.

    Tokens whose lower-cased text is found in the global ``punctuations`` string
    are left out. Lemmatisation and stop-word removal are intentionally not applied.
    """
    return [
        token.lower_
        for token in parser(sentence)
        if token.lower_ not in punctuations
    ]

In [None]:
# Bag-of-words features (unigrams and bigrams) built with the custom spaCy tokenizer.
# vectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,2), min_df=1, max_df=1.0)
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,2), min_df=1, max_df=1.0)

# Learn the vocabulary from the training split only. Fitting on all of
# data_train['tokens'] would also include the held-out rows in Test_X and leak
# evaluation data into the feature space.
vectorizer.fit(Train_X)

Train_X_vectorized = vectorizer.transform(Train_X)
Test_X_vectorized = vectorizer.transform(Test_X)

In [None]:
print(vectorizer.vocabulary_)
print(len(vectorizer.vocabulary_))

# Training

In [None]:
def conf_matrix(Test_Y, pred_test):
    """Draw the confusion matrix of ``pred_test`` against ``Test_Y`` as a heatmap.

    Rows are the true classes and columns the predicted classes. The axis labels
    are the sorted classes that occur in either input, so the plot also works
    when a class is missing from both (a fixed 1..7 labelling would then fail
    with a shape mismatch).
    """
    # Sorted union of the classes seen in the truth and in the predictions.
    labels = np.unique(np.concatenate([np.asarray(Test_Y), np.asarray(pred_test)]))

    con_mat = confusion_matrix(Test_Y, pred_test, labels=labels)
    con_mat = pd.DataFrame(con_mat, index=labels, columns=labels)

    plt.figure(figsize=(6,6))
    sns.set(font_scale=1.5)
    sns.heatmap(con_mat, annot=True, annot_kws={"size": 16}, fmt='g', cmap='Blues', cbar=False)

## Logistic regression (solver='newton-cg')

In [None]:
# Logistic regression with the newton-cg solver, fitted on the training split.
lr_classifier = LogisticRegression(solver='newton-cg').fit(Train_X_vectorized, Train_Y)

# Predictions for the held-out split
pred_lr1 = lr_classifier.predict(Test_X_vectorized)

In [None]:
f1_test = f1_score(Test_Y, pred_lr1, average='macro') # 'macro' - calculate F1 for each label and find their unweighted mean
print('The f1 score for the testing data:', f1_test)
conf_matrix(Test_Y, pred_lr1)

## Logistic regression (solver='newton-cg', class_weight='balanced')

In [None]:
# Logistic regression (newton-cg) with balanced class weights, fitted on the training split.
lr2 = LogisticRegression(solver='newton-cg', class_weight='balanced').fit(Train_X_vectorized, Train_Y)

# Predictions for the held-out split
pred_lr2 = lr2.predict(Test_X_vectorized)

In [None]:
f1_test = f1_score(Test_Y, pred_lr2, average='macro')
print('The f1 score for the testing data:', f1_test)

conf_matrix(Test_Y, pred_lr2)

## Naive Bayes

In [None]:
# Multinomial Naive Bayes: fit on the training split, predict the held-out split
# and report the accuracy as a percentage.
classifier = naive_bayes.MultinomialNB().fit(Train_X_vectorized, Train_Y)
prediction_NB = classifier.predict(Test_X_vectorized)
print("Naive Bayes Accuracy Score: {}%".format(round(accuracy_score(Test_Y, prediction_NB)*100, 2)))

In [None]:
f1_test = f1_score(Test_Y, prediction_NB, average='macro')
print('The f1 score for the testing data:', f1_test)
conf_matrix(Test_Y, prediction_NB)

## SVM

In [None]:
# Linear-kernel SVM: fit on the training split, predict the held-out split
# and report the accuracy as a percentage.
svm_classifier = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto').fit(Train_X_vectorized, Train_Y)
prediction_SVM = svm_classifier.predict(Test_X_vectorized)
print("SVM Accuracy Score: {}%".format(round(accuracy_score(Test_Y, prediction_SVM)*100, 2)))

In [None]:
f1_test = f1_score(Test_Y, prediction_SVM, average='macro')
print('The f1 score for the testing data:', f1_test)
conf_matrix(Test_Y, prediction_SVM)

## Logistic Regression (solver='lbfgs')

In [None]:
# Logistic regression with the lbfgs solver (up to 2000 iterations): fit on the
# training split, predict the held-out split and report the accuracy as a percentage.
classifier = LogisticRegression(solver='lbfgs', max_iter=2000).fit(Train_X_vectorized, Train_Y)
prediction_lr3 = classifier.predict(Test_X_vectorized)
print("Logistic Regression Accuracy Score: {}%".format(round(accuracy_score(Test_Y, prediction_lr3)*100, 2)))

In [None]:
f1_test = f1_score(Test_Y, prediction_lr3, average='macro')
print('The f1 score for the testing data:', f1_test)
conf_matrix(Test_Y, prediction_lr3)

## Logistic Regression (solver='lbfgs', class_weight='balanced')

In [None]:
# Logistic regression (lbfgs, up to 2000 iterations) with balanced class weights: fit on
# the training split, predict the held-out split and report the accuracy as a percentage.
classifier = LogisticRegression(solver='lbfgs', max_iter=2000, class_weight='balanced').fit(Train_X_vectorized, Train_Y)
prediction_lr4 = classifier.predict(Test_X_vectorized)
print("Logistic Regression Accuracy Score: {}%".format(round(accuracy_score(Test_Y, prediction_lr4)*100, 2)))

In [None]:
f1_test = f1_score(Test_Y, prediction_lr4, average='macro')
print('The f1 score for the testing data:', f1_test)
conf_matrix(Test_Y, prediction_lr4)

## Random Forest

In [None]:
# Random forest with 510 trees and a fixed seed: fit on the training split, predict the
# held-out split and report the accuracy as a percentage.
# Add verbose=3 (more than 1) to see progress
rf_classifier = RandomForestClassifier(n_estimators=510, random_state=42, verbose=1).fit(Train_X_vectorized, Train_Y)
prediction_randomforest = rf_classifier.predict(Test_X_vectorized)
print("Random Forest Accuracy Score: {}%".format(round(accuracy_score(Test_Y, prediction_randomforest)*100, 2)))

In [None]:
f1_test = f1_score(Test_Y, prediction_randomforest, average='macro')
print('The f1 score for the testing data:', f1_test)
conf_matrix(Test_Y, prediction_randomforest)

## Random Forest (class_weight='balanced')

In [None]:
# Random forest (510 trees, fixed seed) with balanced class weights: fit on the training
# split, predict the held-out split and report the accuracy as a percentage.
# Add verbose=3 (more than 1) to see progress
classifier = RandomForestClassifier(n_estimators=510, random_state=42, verbose=1, class_weight='balanced').fit(Train_X_vectorized, Train_Y)
prediction_randomforest2 = classifier.predict(Test_X_vectorized)
print("Random Forest Accuracy Score: {}%".format(round(accuracy_score(Test_Y, prediction_randomforest2)*100, 2)))

In [None]:
f1_test = f1_score(Test_Y, prediction_randomforest2, average='macro')
print('The f1 score for the testing data:', f1_test)
conf_matrix(Test_Y, prediction_randomforest2)

# Accuracies

In [None]:
print("Naive Bayes: {}%".format(round(accuracy_score(prediction_NB, Test_Y)*100, 2)))
print()
print("SVM: {}%".format(round(accuracy_score(prediction_SVM, Test_Y)*100, 2)))
print(confusion_matrix(Test_Y, prediction_SVM))
print()
print("Logistic Regression: {}%".format(round(accuracy_score(prediction_lr4, Test_Y)*100, 2)))
print("Random Forest: {}%".format(round(accuracy_score(prediction_randomforest, Test_Y)*100, 2)))

# Predict

In [None]:
# 7
# Vectorize the real test set with the already fitted vectorizer and predict its
# classes with the first random-forest model (rf_classifier).
gt_test_X_vectorized = vectorizer.transform(data_test['tokens'])

print(gt_test_X_vectorized.shape)

# NOTE: this rebinds prediction_randomforest, which until now held the predictions for
# the held-out split (Test_X_vectorized); cells that compare it with Test_Y must not be
# re-run after this one.
prediction_randomforest = rf_classifier.predict(gt_test_X_vectorized)

print(prediction_randomforest.shape)

In [None]:
df = pd.DataFrame(data_test['A'])

In [None]:
df['P'] = pd.DataFrame(prediction_randomforest, columns = ['P'])

In [None]:
df

In [None]:
# df.to_csv(r'res_7_/res_7.csv',index=False)

In [None]:
os.listdir()

# Random

In [None]:
df_4fun = pd.DataFrame(data_test['A'])

In [None]:
# Baseline: one uniformly random class per test row. numpy's randint excludes the
# upper bound, so 8 is required for all of the labels 1..7 to be possible.
df_4fun['P'] = np.random.randint(1, 8, df_4fun.shape[0])

In [None]:
df_4fun

In [None]:
# df.to_csv(r'res/res_3_df4fun.csv',index=False)