## Todo 04/12/2022

1. Need to figure out a way to account for words that have never been seen before
    * So we need to figure out a way to bin words that are similar to each other
    * here is what we can do: https://www.analyticsvidhya.com/blog/2020/05/what-is-tokenization-nlp/
    * https://neptune.ai/blog/wasserstein-distance-and-textual-similarity
    * Remove stop words (preprocessing)
    * 

__Imports__

In [230]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
#nltk.download()

__Read in Data__

In [231]:
df_1 = pd.read_csv("data/train.csv")
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [232]:
df_1.describe()

Unnamed: 0,score
count,36473.0
mean,0.362062
std,0.258335
min,0.0
25%,0.25
50%,0.25
75%,0.5
max,1.0


In [233]:
df_1.isnull().any()

id         False
anchor     False
target     False
context    False
score      False
dtype: bool

__This is the amount of words in each row with respect to the column__
* Must be applied to anchor and target

These are the unique amount of words in __anchor__ column

In [234]:
np.unique(np.array(df_1['anchor'].apply(lambda x : len(x.split()))))

array([1, 2, 3, 4, 5], dtype=int64)

These are the unique amount of words in __target__ column

In [235]:
np.unique(np.array(df_1['target'].apply(lambda x : len(x.split()))))

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 15],
      dtype=int64)

We can probably drop __context__ because its a label

In [236]:
np.unique(np.array(df_1['context'].apply(lambda x : len(x.split()))))

array([1], dtype=int64)

__Tokenization__

In [237]:
from nltk.tokenize import word_tokenize
df_1['target'] = df_1['target'].apply(lambda x : word_tokenize(x))
df_1['anchor'] = df_1['anchor'].apply(lambda x : word_tokenize(x))

In [238]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[abatement],"[abatement, of, pollution]",A47,0.5
1,7b9652b17b68b7a4,[abatement],"[act, of, abating]",A47,0.75
2,36d72442aefd8232,[abatement],"[active, catalyst]",A47,0.25
3,5296b0c19e1ce60e,[abatement],"[eliminating, process]",A47,0.5
4,54c1e3b9184cb5b6,[abatement],"[forest, region]",A47,0.0


__Stopwords Removal__

In [239]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df_1['target'] = df_1['target'].apply(lambda x : [w for w in x if not w in stop_words])
df_1['anchor'] = df_1['anchor'].apply(lambda x : [w for w in x if not w in stop_words])

In [240]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[abatement],"[abatement, pollution]",A47,0.5
1,7b9652b17b68b7a4,[abatement],"[act, abating]",A47,0.75
2,36d72442aefd8232,[abatement],"[active, catalyst]",A47,0.25
3,5296b0c19e1ce60e,[abatement],"[eliminating, process]",A47,0.5
4,54c1e3b9184cb5b6,[abatement],"[forest, region]",A47,0.0


__Stemming__

In [241]:
from nltk.stem.snowball import SnowballStemmer
snowBallStemmer = SnowballStemmer("english")
df_1['target'] = df_1['target'].apply(lambda x : [snowBallStemmer.stem(word) for word in x])
df_1['anchor'] = df_1['anchor'].apply(lambda x: [snowBallStemmer.stem(word) for word in x])

In [242]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[abat],"[abat, pollut]",A47,0.5
1,7b9652b17b68b7a4,[abat],"[act, abat]",A47,0.75
2,36d72442aefd8232,[abat],"[activ, catalyst]",A47,0.25
3,5296b0c19e1ce60e,[abat],"[elimin, process]",A47,0.5
4,54c1e3b9184cb5b6,[abat],"[forest, region]",A47,0.0


__Need to convert array to just strings__

In [243]:
df_1['anchor'] = df_1['anchor'].apply(lambda x : ','.join(map(str,x)))
df_1['target'] = df_1['target'].apply(lambda x : ','.join(map(str,x)))

In [244]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abat,"abat,pollut",A47,0.5
1,7b9652b17b68b7a4,abat,"act,abat",A47,0.75
2,36d72442aefd8232,abat,"activ,catalyst",A47,0.25
3,5296b0c19e1ce60e,abat,"elimin,process",A47,0.5
4,54c1e3b9184cb5b6,abat,"forest,region",A47,0.0


These the the unique words in the __anchor__ column 10 are shown below

In [245]:
np.unique(np.array(df_1['anchor']))[:10]

array(['abat', 'abnorm,posit', 'absorb,properti', 'acan', 'accept,inform',
       'achiev,authent', 'acid,absorpt', 'ack', 'acoustoopt,modul',
       'acryl,group'], dtype=object)

These the the unique words in the __target__ column 10 are shown below

In [246]:
np.unique(np.array(df_1['target']))[:10]

array(['', '1,amino,2,methoxyethan', '1,azabicyclo', '1,bromopropan',
       '1,methoxi,2,aminoethan', '10,bit,dac',
       '111444,hexafluoro,2,buten', '12,dibromopropan',
       '120,degre,interv', '12cao7al2o3'], dtype=object)

We need to seperate the lengths

In [256]:
anchor_set = set()

In [253]:
def separator(arr):
    for i in arr:
        anchor_set.add
        return i
    
    

'wood'

In [247]:
df_1['anchor'] = df_1['anchor'].apply(lambda x: x.split(','))

In [255]:
df_1['anchor'].apply(lambda x: separator(x))

0        abat
1        abat
2        abat
3        abat
4        abat
         ... 
36468    wood
36469    wood
36470    wood
36471    wood
36472    wood
Name: anchor, Length: 36473, dtype: object

In [224]:
df_1['anchor'].apply(lambda x: len(x))

0         4
1         4
2         4
3         4
4         4
         ..
36468    11
36469    11
36470    11
36471    11
36472    11
Name: anchor, Length: 36473, dtype: int64

__Label Encoding__

In [205]:
labelencoder_anchor = LabelEncoder()
labelencoder_anchor.fit(np.unique(np.array(df_1['anchor'])))
# df_1['anchor'] = labelencoder_anchor.transform(np.array(df_1['anchor']))

LabelEncoder()

In [206]:
np.unique(np.array(df_1['anchor']))

array(['abat', 'abnorm,posit', 'absorb,properti', 'acan', 'accept,inform',
       'achiev,authent', 'acid,absorpt', 'ack', 'acoustoopt,modul',
       'acryl,group', 'activ,posit', 'acyl,acid', 'adapt,learn',
       'adapt,linear', 'addit,element', 'adhes,mount', 'adjac,later',
       'adjust,gas,flow', 'adjust,multipl', 'aesthet,effect', 'agit,mean',
       'air,flow,line', 'air,vent,open', 'align,input,shaft',
       'alpha,gypsum', 'alphat', 'aluminium,compound', 'alumino,silic',
       'ammonia,recoveri', 'androgen,receptor,modul',
       'angular,contact,bear', 'anim,fat', 'annular,end', 'annular,neck',
       'antiatherosclerot', 'antidiuret', 'antigen,composit',
       'apart,elong', 'appli,anod,electrod', 'appli,muscl',
       'appli,request', 'appli,tube', 'applic,messag', 'appropri,adjust',
       'aralkynyl', 'arc,type', 'arcad', 'arcuat,mean', 'arm,design',
       'arm,roller', 'arrang,fashion', 'associ,log', 'automat,coffe',
       'auxiliari,water', 'averag,impact', 'avera

In [209]:
df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abat,"abat,pollut",A47,0.5
1,7b9652b17b68b7a4,abat,"act,abat",A47,0.75
2,36d72442aefd8232,abat,"activ,catalyst",A47,0.25
3,5296b0c19e1ce60e,abat,"elimin,process",A47,0.5
4,54c1e3b9184cb5b6,abat,"forest,region",A47,0.0


In [152]:
# df_1['anchor'] = df_1['anchor'].apply(lambda x: [i for i in x] )

In [153]:
# df_1.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,[[abat]],"abat,pollut",A47,0.5
1,7b9652b17b68b7a4,[[abat]],"act,abat",A47,0.75
2,36d72442aefd8232,[[abat]],"activ,catalyst",A47,0.25
3,5296b0c19e1ce60e,[[abat]],"elimin,process",A47,0.5
4,54c1e3b9184cb5b6,[[abat]],"forest,region",A47,0.0


In [211]:
np.array(df_1['anchor'])[0]

'abat'

In [213]:
df_1['anchor'].apply(lambda x: x)

0               abat
1               abat
2               abat
3               abat
4               abat
            ...     
36468    wood,articl
36469    wood,articl
36470    wood,articl
36471    wood,articl
36472    wood,articl
Name: anchor, Length: 36473, dtype: object

In [220]:
def labeler (arr):
    temp = arr.split(",")
    temp = [temp]
    for i in temp:
        print(labelencoder_anchor.transform(i))
    return arr

labeler('wood,articl')

ValueError: y contains previously unseen labels: 'wood'

In [155]:
df_1['anchor'].apply(lambda x: x)

0                [[abat]]
1                [[abat]]
2                [[abat]]
3                [[abat]]
4                [[abat]]
               ...       
36468    [[wood, articl]]
36469    [[wood, articl]]
36470    [[wood, articl]]
36471    [[wood, articl]]
36472    [[wood, articl]]
Name: anchor, Length: 36473, dtype: object

In [35]:
# count_vect = CountVectorizer(analyzer='word')
# corpus =  np.array(df_1['target'])[0]
# X_counts = count_vect.fit_transform(corpus)
# df_1['target'].apply(lambda x: count_vect.fit_transform(x).toarray()[0])
# print(X_counts.shape)
# print(count_vect.get_feature_names())
# print(X_counts.toarray()[0])
# import pandas as pd
# X_counts_df = pd.DataFrame(X_counts.toarray(), columns=count_vect.get_feature_names())
# X_counts_df.head(10)

__Label Encoder__

In [36]:
# labelencoder_anchor = LabelEncoder()
# labelencoder_target = LabelEncoder()
# labelencoder_context = LabelEncoder()
# df_1['anchor'] = labelencoder_anchor.fit_transform(df_1['anchor'])
# df_1['target'] = labelencoder_target.fit_transform(df_1['target'])
# df_1['context'] = labelencoder_context.fit_transform(df_1['context'])

__Label Encoder Checkpoint__

In [37]:
# df_1.head()

__Preparing test__
* Will be used later for the competition

In [38]:
# test = pd.read_csv("test.csv")
# test['anchor'] = labelencoder_anchor.transform(test['anchor'])
# test['target'] = labelencoder_target.transform(test['target'])
# test['context'] = labelencoder_context.transform(test['context'])
# test.head()
# test_id = np.array(test['id'])
# test.drop('id',axis=1,inplace=True)
# x_test = test

In [197]:
df_1['merger'] = df_1['anchor'] + ","+ df_1['target']

__Spliting (70/30)__
* Here we test/evaluate our models

In [198]:
# x = df_1.drop(['id','score'],axis=1).values
x = df_1['merger']
y = df_1['score']

In [199]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=141)

In [200]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
count_vect = CountVectorizer()
tf_transformer = TfidfTransformer(use_idf=False)
x_train_counts = count_vect.fit_transform(x_train)
x_train_tf = tf_transformer.fit_transform(x_train_counts)
#df_1['target'] = df_1['target'].apply(lambda x: ["".join(tokens) for tokens in x ])

#X_counts = count_vect.fit_transform(corpus)
# print(X_counts.shape)
# print(count_vect.get_feature_names())
# print(X_counts.toarray())

__Random Forest Regressor__

In [201]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(random_state=141)
regr.fit(x_train_tf, y_train)
x_pre_counts = count_vect.transform(x_test)
x_pre_tf = tf_transformer.transform(x_pre_counts)
y_pred = np.round(regr.predict(x_pre_tf),decimals=2)
from sklearn.metrics import mean_squared_error, mean_absolute_error
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test,y_pred)
print("Mean Square Error: {} \nRoot Mean Squared Error: {} \nMean Absolute Error: {} ".format(mse,rmse,mae))

KeyboardInterrupt: 

__Feature Importance__
* This is based on the gini index from the random forest model

In [None]:
# features = [df_1.columns[i] for i in range(1,3)]
# feature_importance_nums = regr.feature_importances_
# feature_importance_df = pd.DataFrame(feature_importance_nums).transpose()
# feature_importance_df.columns = features
# feature_importance_df.head()

__Lasso Regression__

In [202]:
from sklearn.linear_model import Lasso
lr = Lasso(alpha=0.5)
lr.fit(x_train_tf, y_train)
x_pre_counts = count_vect.transform(x_test)
x_pre_tf = tf_transformer.transform(x_pre_counts)
y_pred = np.round(lr.predict(x_pre_tf),decimals=2)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred,squared=False)
mae = mean_absolute_error(y_test,y_pred)
print("Mean Square Error: {} \nRoot Mean Squared Error: {} \nMean Absolute Error: {} ".format(mse,rmse,mae))

Mean Square Error: 0.06716260281484189 
Root Mean Squared Error: 0.2591574865112754 
Mean Absolute Error: 0.21922683238895996 


__Prediction Submission__
* Will be used later for competition

In [None]:
# pd.read_csv("sample_submission.csv").head()

# df_final.head()

# df_final.to_csv("Michael_Woo_Submission.csv",index=False)

# pd.read_csv("Michael_Woo_Submission.csv")