## Todo 04/12/2022

1. Need to figure out a way to account for words that have never been seen before
    * So we need to figure out a way to bin words that are similar to each other
    * here is what we can do: https://www.analyticsvidhya.com/blog/2020/05/what-is-tokenization-nlp/
    * https://neptune.ai/blog/wasserstein-distance-and-textual-similarity
    * Remove stop words (preprocessing)
    * 

__Imports__

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
#nltk.download()

__Read in Data__

In [2]:
df_1 = pd.read_csv("data/train.csv")
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [3]:
df_1.describe()

Unnamed: 0,score
count,36473.0
mean,0.362062
std,0.258335
min,0.0
25%,0.25
50%,0.25
75%,0.5
max,1.0


In [4]:
df_1.isnull().any()

id         False
anchor     False
target     False
context    False
score      False
dtype: bool

__This is the amount of words in each row with respect to the column__
* Must be applied to anchor and target

These are the unique amount of words in __anchor__ column

In [5]:
np.unique(np.array(df_1['anchor'].apply(lambda x : len(x.split()))))

array([1, 2, 3, 4, 5], dtype=int64)

These are the unique amount of words in __target__ column

In [6]:
np.unique(np.array(df_1['target'].apply(lambda x : len(x.split()))))

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 15],
      dtype=int64)

We can probably drop __context__ because its a label

In [7]:
np.unique(np.array(df_1['context'].apply(lambda x : len(x.split()))))

array([1], dtype=int64)

__Tokenization__

In [8]:
from nltk.tokenize import word_tokenize
df_1['target'] = df_1['target'].apply(lambda x : word_tokenize(x))
df_1['anchor'] = df_1['anchor'].apply(lambda x : word_tokenize(x))

In [9]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[abatement],"[abatement, of, pollution]",A47,0.5
1,7b9652b17b68b7a4,[abatement],"[act, of, abating]",A47,0.75
2,36d72442aefd8232,[abatement],"[active, catalyst]",A47,0.25
3,5296b0c19e1ce60e,[abatement],"[eliminating, process]",A47,0.5
4,54c1e3b9184cb5b6,[abatement],"[forest, region]",A47,0.0


__Stopwords Removal__

In [10]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df_1['target'] = df_1['target'].apply(lambda x : [w for w in x if not w in stop_words])
df_1['anchor'] = df_1['anchor'].apply(lambda x : [w for w in x if not w in stop_words])

In [11]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[abatement],"[abatement, pollution]",A47,0.5
1,7b9652b17b68b7a4,[abatement],"[act, abating]",A47,0.75
2,36d72442aefd8232,[abatement],"[active, catalyst]",A47,0.25
3,5296b0c19e1ce60e,[abatement],"[eliminating, process]",A47,0.5
4,54c1e3b9184cb5b6,[abatement],"[forest, region]",A47,0.0


__Stemming__

In [12]:
from nltk.stem.snowball import SnowballStemmer
snowBallStemmer = SnowballStemmer("english")
df_1['target'] = df_1['target'].apply(lambda x : [snowBallStemmer.stem(word) for word in x])
df_1['anchor'] = df_1['anchor'].apply(lambda x: [snowBallStemmer.stem(word) for word in x])

In [13]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[abat],"[abat, pollut]",A47,0.5
1,7b9652b17b68b7a4,[abat],"[act, abat]",A47,0.75
2,36d72442aefd8232,[abat],"[activ, catalyst]",A47,0.25
3,5296b0c19e1ce60e,[abat],"[elimin, process]",A47,0.5
4,54c1e3b9184cb5b6,[abat],"[forest, region]",A47,0.0


__Need to convert array to just strings__

In [14]:
df_1['anchor'] = df_1['anchor'].apply(lambda x : ','.join(map(str,x)))
df_1['target'] = df_1['target'].apply(lambda x : ','.join(map(str,x)))

In [15]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abat,"abat,pollut",A47,0.5
1,7b9652b17b68b7a4,abat,"act,abat",A47,0.75
2,36d72442aefd8232,abat,"activ,catalyst",A47,0.25
3,5296b0c19e1ce60e,abat,"elimin,process",A47,0.5
4,54c1e3b9184cb5b6,abat,"forest,region",A47,0.0


These the the unique words in the __anchor__ column 10 are shown below

In [16]:
np.unique(np.array(df_1['anchor']))[:10]

array(['abat', 'abnorm,posit', 'absorb,properti', 'acan', 'accept,inform',
       'achiev,authent', 'acid,absorpt', 'ack', 'acoustoopt,modul',
       'acryl,group'], dtype=object)

These the the unique words in the __target__ column 10 are shown below

In [17]:
np.unique(np.array(df_1['target']))[:10]

array(['', '1,amino,2,methoxyethan', '1,azabicyclo', '1,bromopropan',
       '1,methoxi,2,aminoethan', '10,bit,dac',
       '111444,hexafluoro,2,buten', '12,dibromopropan',
       '120,degre,interv', '12cao7al2o3'], dtype=object)

We have the issue of words going together, we need to seperate these so we can
we need to seperate the lengths

In [18]:
anchor_set = set()
target_set = set()

In [19]:
def separator_anchor(arr):
    for i in arr:
        anchor_set.add(i)

In [20]:
def separator_target(arr):
    for i in arr:
        target_set.add(i)

In [21]:
df_1['anchor'] = df_1['anchor'].apply(lambda x: x.split(','))
df_1['target'] = df_1['target'].apply(lambda x: x.split(','))

In [22]:
df_1['anchor'].apply(lambda x: separator_anchor(x))
df_1['target'].apply(lambda x: separator_target(x))

0        None
1        None
2        None
3        None
4        None
         ... 
36468    None
36469    None
36470    None
36471    None
36472    None
Name: target, Length: 36473, dtype: object

__list comprehension__

In [23]:
anchor_arr = [i for i in anchor_set]
target_arr = [i for i in target_set]

__Label Encoding__

In [24]:
labelencoder_anchor = LabelEncoder()
labelencoder_anchor.fit(anchor_arr)
labelencoder_target = LabelEncoder()
labelencoder_target.fit(target_arr)

LabelEncoder()

In [25]:
def encoding_anchor(arr):
    temp = []
    if len(arr) < 2:
        for i in arr:
            #print(labelencoder_anchor.transform([i]))
            return labelencoder_anchor.transform([i])
    if len(arr) > 1:
        for i in arr:
            #print(labelencoder_anchor.transform([i]))
            temp.extend(labelencoder_anchor.transform([i]))
        return temp

In [26]:
def encoding_target(arr):
    temp = []
    if len(arr) < 2:
        for i in arr:
            #print(labelencoder_anchor.transform([i]))
            return labelencoder_target.transform([i])
    if len(arr) > 1:
        for i in arr:
            #print(labelencoder_anchor.transform([i]))
            temp.extend(labelencoder_target.transform([i]))
        return temp

In [27]:
df_1['anchor'] = df_1['anchor'].apply(lambda x: encoding_anchor(x))

In [28]:
df_1['target'] = df_1['target'].apply(lambda x: encoding_target(x))

In [29]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[0],"[35, 4113]",A47,0.5
1,7b9652b17b68b7a4,[0],"[84, 35]",A47,0.75
2,36d72442aefd8232,[0],"[86, 852]",A47,0.25
3,5296b0c19e1ce60e,[0],"[1798, 4260]",A47,0.5
4,54c1e3b9184cb5b6,[0],"[2177, 4500]",A47,0.0


In [30]:
# count_vect = CountVectorizer(analyzer='word')
# corpus =  np.array(df_1['target'])[0]
# X_counts = count_vect.fit_transform(corpus)
# df_1['target'].apply(lambda x: count_vect.fit_transform(x).toarray()[0])
# print(X_counts.shape)
# print(count_vect.get_feature_names())
# print(X_counts.toarray()[0])
# import pandas as pd
# X_counts_df = pd.DataFrame(X_counts.toarray(), columns=count_vect.get_feature_names())
# X_counts_df.head(10)

__Label Encoder__

In [31]:
# labelencoder_anchor = LabelEncoder()
# labelencoder_target = LabelEncoder()
# labelencoder_context = LabelEncoder()
# df_1['anchor'] = labelencoder_anchor.fit_transform(df_1['anchor'])
# df_1['target'] = labelencoder_target.fit_transform(df_1['target'])
# df_1['context'] = labelencoder_context.fit_transform(df_1['context'])

__Label Encoder Checkpoint__

In [32]:
# df_1.head()

__Preparing test__
* Will be used later for the competition

In [33]:
# test = pd.read_csv("test.csv")
# test['anchor'] = labelencoder_anchor.transform(test['anchor'])
# test['target'] = labelencoder_target.transform(test['target'])
# test['context'] = labelencoder_context.transform(test['context'])
# test.head()
# test_id = np.array(test['id'])
# test.drop('id',axis=1,inplace=True)
# x_test = test

In [34]:
# df_1['merger'] = df_1['anchor'] + ","+ df_1['target']

__Spliting (70/30)__
* Here we test/evaluate our models

In [35]:
# x = df_1.drop(['id','score'],axis=1).values
x = df_1[['anchor','target']]
y = df_1['score']

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=141)

In [37]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# count_vect = CountVectorizer()
# tf_transformer = TfidfTransformer(use_idf=False)
# x_train_counts = count_vect.fit_transform(x_train)
# x_train_tf = tf_transformer.fit_transform(x_train_counts)
#df_1['target'] = df_1['target'].apply(lambda x: ["".join(tokens) for tokens in x ])

#X_counts = count_vect.fit_transform(corpus)
# print(X_counts.shape)
# print(count_vect.get_feature_names())
# print(X_counts.toarray())

__Random Forest Regressor__

In [41]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[0],"[35, 4113]",A47,0.5
1,7b9652b17b68b7a4,[0],"[84, 35]",A47,0.75
2,36d72442aefd8232,[0],"[86, 852]",A47,0.25
3,5296b0c19e1ce60e,[0],"[1798, 4260]",A47,0.5
4,54c1e3b9184cb5b6,[0],"[2177, 4500]",A47,0.0


In [42]:
df_1.to_csv("data_file.csv",index=None)

In [44]:
df_1 = pd.read_csv("data_file.csv")

In [None]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(random_state=141)
regr.fit(x_train, y_train)
y_pred = np.round(regr.predict(x_test),decimals=2)
from sklearn.metrics import mean_squared_error, mean_absolute_error
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test,y_pred)
print("Mean Square Error: {} \nRoot Mean Squared Error: {} \nMean Absolute Error: {} ".format(mse,rmse,mae))

__Feature Importance__
* This is based on the gini index from the random forest model

In [39]:
# features = [df_1.columns[i] for i in range(1,3)]
# feature_importance_nums = regr.feature_importances_
# feature_importance_df = pd.DataFrame(feature_importance_nums).transpose()
# feature_importance_df.columns = features
# feature_importance_df.head()

ERROR! Session/line number was not unique in database. History logging moved to new session 305


__Lasso Regression__

In [None]:
from sklearn.linear_model import Lasso
lr = Lasso(alpha=0.5)
lr.fit(x_train, y_train)
y_pred = np.round(lr.predict(x_test),decimals=2)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test,y_pred)
print("Mean Square Error: {} \nRoot Mean Squared Error: {} \nMean Absolute Error: {} ".format(mse,rmse,mae))

__Prediction Submission__
* Will be used later for competition

In [40]:
# pd.read_csv("sample_submission.csv").head()

# df_final.head()

# df_final.to_csv("Michael_Woo_Submission.csv",index=False)

# pd.read_csv("Michael_Woo_Submission.csv")

ERROR! Session/line number was not unique in database. History logging moved to new session 306
