# FEATURE CALCULATION

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB,ComplementNB
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from time import time
from sklearn.feature_selection import SelectFromModel

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the training set, the test set and the sample submission (in that
# order) from the working directory.
train_data, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'WikiLarge_Train.csv',
        'WikiLarge_Test.csv',
        'sampleSubmission.csv',
    )
)

## 1 : Tokenising the text and part of speech tagging
- Done using NLTK
- The text was divided into tokens
- The part of speech of every token was found using pos_tag from nltk
- The POS tags of every word in the sentence are stored in the form of a dictionary, so I have converted the dictionary to a Series so that every part-of-speech tag becomes a column.


In [None]:
import nltk
from collections import Counter
from nltk.tokenize import RegexpTokenizer

# Lower-case, tokenise and POS-tag both data sets, timing the whole pass.
tic = time()
tokenizer = RegexpTokenizer(r'\w+')
for frame in (train_data, test_data):
    lowered = frame["original_text"].str.lower()
    frame["original_text"] = lowered
    # Tokens are runs of word characters, so punctuation is dropped.
    frame['tokenized_text'] = lowered.apply(tokenizer.tokenize)
    # One list of (token, tag) pairs per sentence.
    frame['pos_tokens'] = frame['tokenized_text'].apply(nltk.pos_tag)
toc = time()
print(f"Done in {toc - tic:.3f}s")


## Calculating numeric features from plain text
- The text was syllabified using the pyphen library and then features like 
    - number of words with one syllable
    - number of words with two syllables
    - number of words with three syllables
    - number of words with four syllables
    - number of words with five syllables
    - lexical diversity was calculated
- We had the POS tag for every word in the input from the previous step. We also calculated the number of words under each POS tag for every input sample.

In [None]:
import pyphen
# Module-level Pyphen instance configured with lang='en'; syllabify() calls
# its inserted() method for every token.
syllabifier = pyphen.Pyphen(lang='en')
#lexical diversity
def lexical_diversity(row):
    """Return the ratio of distinct (token, tag) pairs to all pairs in a row.

    Args:
        row: mapping (e.g. a DataFrame row) whose ``"pos_tokens"`` entry is a
            list of hashable (token, tag) pairs.

    Returns:
        A float in [0, 1]; 0.0 when the row has no tokens.
    """
    pos_tokens = row['pos_tokens']
    # Divide by the row's own token count; a row with no tokens has no
    # diversity to measure, so report 0.0 instead of dividing by zero.
    if not pos_tokens:
        return 0.0
    return len(set(pos_tokens)) / float(len(pos_tokens))

#counting the number of words under each tag
def tagcounter(row):
    """Tally how often each POS tag occurs in the row's ``"pos_tokens"`` pairs."""
    tally = Counter()
    for _word, pos_tag in row["pos_tokens"]:
        tally[pos_tag] += 1
    return tally
def sentence(row):
    """Return the syllable count of every token in the row, in token order."""
    return [syllabify(token) for token in row["tokenized_text"]]
def syllabify(a):
    """Estimate the number of syllables in word ``a``.

    The module-level ``syllabifier`` marks break points with '-'; the result
    is the number of those marks plus one.
    """
    hyphenated = syllabifier.inserted(a)
    return 1 + hyphenated.count('-')

def one_syllable_count(row):
    """Count the entries of ``row["syllabified"]`` that equal 1."""
    syllable_counts = row["syllabified"]
    return sum(1 for n_syllables in syllable_counts if n_syllables == 1)
def two_syllable_count(row):
    """Count the entries of ``row["syllabified"]`` that equal 2."""
    syllable_counts = row["syllabified"]
    return sum(1 for n_syllables in syllable_counts if n_syllables == 2)

def three_syllable_count(row):
    """Count the entries of ``row["syllabified"]`` that equal 3."""
    syllable_counts = row["syllabified"]
    return sum(1 for n_syllables in syllable_counts if n_syllables == 3)

def four_syllable_count(row):
    """Count the entries of ``row["syllabified"]`` that equal 4."""
    syllable_counts = row["syllabified"]
    return sum(1 for n_syllables in syllable_counts if n_syllables == 4)

def five_syllable_count(row):
    """Count the entries of ``row["syllabified"]`` that equal 5."""
    syllable_counts = row["syllabified"]
    return sum(1 for n_syllables in syllable_counts if n_syllables == 5)
def num(row):
    """Return how many tokens the row's ``"tokenized_text"`` holds."""
    tokens = row["tokenized_text"]
    return len(tokens)

In [None]:
def calculate_numeric_features(data):
    """Add the row-wise numeric text features to ``data`` in place.

    Expects the columns ``"tokenized_text"`` and ``"pos_tokens"`` and adds
    ``lexical_diversity``, ``pos_count``, ``verb_count``, ``noun_count``,
    ``pronoun_count``, ``syllabified``, ``1_syllable_count`` to
    ``5_syllable_count`` and ``length``.

    Args:
        data: DataFrame to annotate; it is modified in place.

    Returns:
        The same DataFrame, for convenient rebinding by the caller.
    """
    data["lexical_diversity"] = data.apply(lexical_diversity, axis=1)
    data["pos_count"] = data.apply(tagcounter, axis=1)

    # NOTE(review): the three totals below are summed over columns whose
    # *name* contains 'verb' / 'noun' / 'pronoun'. None of the columns this
    # function creates has such a name, so unless the caller's frame already
    # carries expanded POS columns these sums are all zero - confirm whether
    # they are still wanted here.
    verb_cols = [col for col in data.columns if 'verb' in col and "adverb" not in col]
    noun_cols = [col for col in data.columns if 'noun' in col and "pronoun" not in col]
    pronoun_cols = [col for col in data.columns if "pronoun" in col]
    data["verb_count"] = data[verb_cols].sum(axis=1, skipna=True)
    data["noun_count"] = data[noun_cols].sum(axis=1, skipna=True)
    data["pronoun_count"] = data[pronoun_cols].sum(axis=1, skipna=True)

    data["syllabified"] = data.apply(sentence, axis=1)
    syllable_counters = (
        (1, one_syllable_count),
        (2, two_syllable_count),
        (3, three_syllable_count),
        (4, four_syllable_count),
        (5, five_syllable_count),
    )
    for n_syllables, counter in syllable_counters:
        data[f"{n_syllables}_syllable_count"] = data.apply(counter, axis=1)

    # Per-row token count (not the number of rows in the frame).
    data["length"] = data.apply(num, axis=1)
    return data

In [None]:
# Annotate the training frame first, then the test frame, with the numeric
# text features.
train_data, test_data = [
    calculate_numeric_features(frame) for frame in (train_data, test_data)
]

## Calculating features from the part of speech tags
- We have renamed the columns in the dataframe for easier understanding

In [None]:
# Readable column names for the POS tags, defined once and applied to both
# data sets so that train and test cannot drift apart.
# VBP gets its own name: mapping both VBP and VBZ to "verb_present_3s" would
# leave two identically named columns in the frame (and in the saved CSV).
# NOTE(review): "JJ" is mapped to "ordinal" and "NNP" to "noun_singular";
# kept as-is so downstream column names do not change - confirm the labels.
POS_TAG_COLUMN_NAMES = {
    "CC": "conjunction",
    "CD": "numeral",
    "DT": "determiner",
    "EX": "existential",
    "FW": "foreign word",
    "IN": "preposition",
    "JJ": "ordinal",
    "JJR": "adjective_comparative",
    "JJS": "adjective_superlative",
    "LS": "list",
    "MD": "modal_auxiliary",
    "NN": "noun_common",
    "NNP": "noun_singular",
    "NNPS": "proper_noun",
    "NNS": "noun_common_plural",
    "PDT": "pre_determiner",
    "POS": "genitive_marker",
    "PRP": "pronoun_personal",
    "PRP$": "pronoun_possessive",
    "RB": "adverb",
    "RBR": "adverb_comparative",
    "RBS": "adverb_superlative",
    "RP": "particle",
    "SYM": "symbol",
    "TO": "to_preposition",
    "UH": "interjection",
    "VB": "verb_base",
    "VBD": "verb_past",
    "VBG": "gerund",
    "VBN": "verb_past_participle",
    "VBP": "verb_present_not_3s",
    "VBZ": "verb_present_3s",
    "WDT": "WH_determiner",
    "WP": "WH_pronoun",
    "WP$": "possessive_wh_pronoun",
    "WRB": "Wh_adverb",
}

# Expand each row's tag Counter into one column per tag (NaN where a tag does
# not occur in that row), then give the columns their readable names.
pos_counts_train_data = train_data["pos_count"].apply(pd.Series)
pos_counts_test_data = test_data["pos_count"].apply(pd.Series)
pos_counts_train_data.rename(columns=POS_TAG_COLUMN_NAMES, inplace=True)
pos_counts_test_data.rename(columns=POS_TAG_COLUMN_NAMES, inplace=True)



## Counting the number of verbs,nouns and pronouns from the part of speech data
- This was done by adding up the contents of the columns whose names contain "verb", the columns whose names contain "noun", and so on.
- The part-of-speech features were calculated separately for the train and test data and stored in separate .csv files so that they do not have to be recalculated every time.

In [None]:
# Add verb / noun / pronoun totals to the train frame and then the test frame.
for pos_frame in (pos_counts_train_data, pos_counts_test_data):
    # De-duplicate the labels first (order preserved): selecting a frame with
    # a list that repeats a label returns that label's columns once per
    # repetition, which would count duplicated column names twice in the sum.
    unique_columns = list(dict.fromkeys(pos_frame.columns))
    # All three lists are built before any total column is added, so the
    # totals never feed into each other.
    verb_cols = [col for col in unique_columns if 'verb' in col and "adverb" not in col]
    noun_cols = [col for col in unique_columns if 'noun' in col and "pronoun" not in col]
    pronoun_cols = [col for col in unique_columns if "pronoun" in col]
    pos_frame["verb_count"] = pos_frame[verb_cols].sum(axis=1, skipna=True)
    pos_frame["noun_count"] = pos_frame[noun_cols].sum(axis=1, skipna=True)
    pos_frame["pronoun_count"] = pos_frame[pronoun_cols].sum(axis=1, skipna=True)

In [None]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first file is written into it.
os.makedirs("feature2", exist_ok=True)
pos_counts_train_data.to_csv("feature2/pos_tag_traindata.csv",index=False)
pos_counts_test_data.to_csv("feature2/pos_tag_testdata.csv",index=False)

## Calculating features from the AoA file 
- From the AoA file two types of features were calculated,the aggregated features for all words in the sentence and count features.
- For every input sample the number of words in common with the AoA file was calculated.

### Aggregated Features

In [None]:
# Load the age-of-acquisition table, replacing missing values with 0 and
# lower-casing the column names.
AoA = pd.read_csv("AoA_51715_words.csv").fillna(0)
AoA.columns = AoA.columns.str.lower()
# Keep the vocabulary as a set for fast overlap tests, then index the table
# by word so that rows can be looked up by label.
words_in_AoA = set(AoA["word"])
AoA = AoA.set_index('word')
words_in_aoa, num_in_aoa = [], []

In [None]:
def AOA_words(data, vocabulary=None):
    """Annotate each row with the distinct tokens found in the AoA vocabulary.

    Adds two columns to ``data`` in place: ``"aoa_words"`` (list of distinct
    tokens of the row that are in the vocabulary, in no particular order) and
    ``"num_words_AoA"`` (the length of that list).

    Args:
        data: DataFrame with a ``"tokenized_text"`` column of token lists.
        vocabulary: optional collection of known words; defaults to the
            module-level ``words_in_AoA`` set.

    Returns:
        The same DataFrame, so that ``frame = AOA_words(frame)`` keeps the
        frame instead of replacing it with None.
    """
    if vocabulary is None:
        vocabulary = words_in_AoA
    # Convert once, outside the loop, rather than copying the whole
    # vocabulary for every row.
    if not isinstance(vocabulary, (set, frozenset)):
        vocabulary = set(vocabulary)

    matched_words = []
    for i, tokens in enumerate(data["tokenized_text"]):
        matched_words.append(list(vocabulary.intersection(tokens)))
        if(i%10000==0):
            print("10k done")
    data["aoa_words"] = matched_words
    data["num_words_AoA"] = [len(words) for words in matched_words]
    return data
# AOA_words writes its two columns into the frame it is given, so the call on
# its own is enough. The result is deliberately not assigned back: rebinding
# would replace the frames with None whenever the function returns nothing.
AOA_words(train_data)
AOA_words(test_data)

In [None]:
def AOA_aggregated_features(data, aoa_table=None):
    """Average the AoA ratings of each row's matched words.

    Args:
        data: DataFrame with an ``"aoa_words"`` column holding, per row, the
            words to look up (every word must be an index label of the table).
        aoa_table: optional ratings table indexed by word; defaults to the
            module-level ``AoA`` frame.

    Returns:
        A new DataFrame with one row per input row (default integer index)
        and one column per rating; rows without any matched word are all NaN.
    """
    if aoa_table is None:
        aoa_table = AoA
    feature_columns = ["aoa_bird_lem","aoa_bristol_lem","aoa_cort_lem","aoa_kup","aoa_kup_lem","aoa_schock","freq_pm","nletters",  "nphon",  "nsyll", "perc_known","perc_known_lem"]
    # Narrow the table once instead of re-selecting the columns per word.
    ratings = aoa_table[feature_columns]

    # Collect one mean-vector per sentence and build the frame at the end;
    # growing a DataFrame row by row is quadratic, and DataFrame.append no
    # longer exists in pandas 2.x.
    sentence_means = []
    for i, words in enumerate(data["aoa_words"]):
        sentence_means.append(ratings.loc[list(words)].mean(axis=0))
        if(i%1000==0):
            print("1k done")
    sentence_aoa = pd.DataFrame(sentence_means, columns=feature_columns)
    return sentence_aoa.reset_index(drop=True)

In [None]:
# Aggregate the AoA ratings for each data set from its own rows, then store
# the two feature tables.
train_aoa_aggregated=AOA_aggregated_features(train_data)
# The test features come from test_data (not a second pass over train_data).
test_aoa_aggregated=AOA_aggregated_features(test_data)
train_aoa_aggregated.to_csv("feature2/aoa_features_traindata.csv",index=False)
test_aoa_aggregated.to_csv("feature2/aoa_features_testdata.csv",index=False)

### Numerical Features
- The number of words in the input sample which has 
    - less than 5 phonemes
    - less than 10 phonemes
    - less than 15 phonemes 
    - less than 20 phonemes
- The number of words in the input sample for which the percentage known is
    - less than 20%
    - less than 50%
    - greater than 90%
    - greater than 75%
- The number of words in the input sample for which the Estimated AoA based on Kuperman et al. study is
    - less than 10
    - less than 20
    - greater than 20
     

In [None]:
# Module-level accumulators, kept for any code that still refers to them.
words_less5_phonemes=[]
words_less10_phonemes=[]
words_less15_phonemes=[]
words_less20_phonemes=[]
perc_known_less_20=[]
perc_known_less_50=[]
perc_known_greater_90=[]
perc_known_greater_75=[]
aoa_kup_lem_less_10=[]
aoa_kup_lem_less_20=[]
aoa_kup_lem_greater_20=[]


def _aoa_words_where(mask):
    """Return the set of AoA words on the rows where boolean ``mask`` is True."""
    # Once AoA has been indexed by word there is no "word" column left, so
    # take the words from the index; fall back to the column if it exists.
    words = AoA["word"] if "word" in AoA.columns else AoA.index
    return set(words[mask.to_numpy()])


# NOTE(review): missing ratings were filled with 0 when the table was loaded,
# so words without a rating land in the lowest bucket of each feature.
_nphon = AoA["nphon"]
_perc_known = AoA['perc_known']
_aoa_kup_lem = AoA['aoa_kup_lem']

# Phoneme buckets: (..5], (5..10], (10..15], (15..20]. Each comparison is
# parenthesised because `&` binds tighter than `<=` / `>`.
words_less5_phonemes_AoA=_aoa_words_where(_nphon<=5)
words_less10_phonemes_AoA=_aoa_words_where((_nphon>5) & (_nphon<=10))
words_less15_phonemes_AoA=_aoa_words_where((_nphon>10) & (_nphon<=15))
words_less20_phonemes_AoA=_aoa_words_where((_nphon>15) & (_nphon<=20))
perc_known_less_20_AoA=_aoa_words_where(_perc_known<=0.20)
perc_known_less_50_AoA=_aoa_words_where(_perc_known<=0.50)
perc_known_greater_90_AoA=_aoa_words_where(_perc_known>=0.90)
perc_known_greater_75_AoA=_aoa_words_where(_perc_known>=0.75)
aoa_kup_lem_less_10_AoA=_aoa_words_where(_aoa_kup_lem<=10)
aoa_kup_lem_less_20_AoA=_aoa_words_where(_aoa_kup_lem<=20)
aoa_kup_lem_greater_20_AoA=_aoa_words_where(_aoa_kup_lem>=20)

In [None]:
def aoa_count_features(data, feature_sets=None):
    """Count, per row, how many matched AoA words fall into each word set.

    Args:
        data: DataFrame with an ``"aoa_words"`` column of word collections.
        feature_sets: optional mapping of output column name to the set of
            words counted for that column; defaults to the module-level
            ``*_AoA`` word sets under their usual column names.

    Returns:
        A new DataFrame with one row per input row and one integer column per
        feature set, in mapping order.
    """
    if feature_sets is None:
        feature_sets = {
            "words_less5_phonemes": words_less5_phonemes_AoA,
            "words_less10_phonemes": words_less10_phonemes_AoA,
            "words_less15_phonemes": words_less15_phonemes_AoA,
            "words_less20_phonemes": words_less20_phonemes_AoA,
            "perc_known_less_20": perc_known_less_20_AoA,
            "perc_known_less_50": perc_known_less_50_AoA,
            "perc_known_greater_90": perc_known_greater_90_AoA,
            "perc_known_greater_75": perc_known_greater_75_AoA,
            "aoa_kup_lem_less_10": aoa_kup_lem_less_10_AoA,
            "aoa_kup_lem_less_20": aoa_kup_lem_less_20_AoA,
            "aoa_kup_lem_greater_20": aoa_kup_lem_greater_20_AoA,
        }

    # The tallies are local to this call, so a second call (e.g. for the test
    # set) starts empty instead of continuing the previous call's lists.
    counts = {name: [] for name in feature_sets}
    for i, aoa_words in enumerate(data["aoa_words"]):
        words = set(aoa_words)
        for name, members in feature_sets.items():
            counts[name].append(len(words & members))
        if(i%10000==0):
            print("10k done")
    return pd.DataFrame(counts, columns=list(feature_sets))

In [None]:
# Build the AoA count features for both data sets, then write each table to
# its own CSV file in the working directory.
AoA_countfeatures_traindata = aoa_count_features(train_data)
AoA_countfeatures_testdata = aoa_count_features(test_data)
for count_features, csv_path in (
    (AoA_countfeatures_traindata, "AoA_countfeatures_traindata.csv"),
    (AoA_countfeatures_testdata, "AoA_countfeatures_testdata.csv"),
):
    count_features.to_csv(csv_path, index=False)

## Features calculated from concreteness
- From the Concreteness_ratings_Brysbaert_et_al_BRM.txt file two types of features were calculated: the aggregated features for all words in the sentence, and count features.
- For every input sample the number of words in common with the concreteness file was calculated.





In [None]:
# Load the tab-separated concreteness ratings, replace missing values with 0
# and lower-case the column names.
concreteness = pd.read_csv(
    "Concreteness_ratings_Brysbaert_et_al_BRM.txt", sep='\t'
).fillna(0)
concreteness.columns = concreteness.columns.str.lower()
# Vocabulary of the ratings file, as a set for fast overlap tests.
wordsconcreteness = set(concreteness["word"])

In [None]:
# Notebook display: show the (lower-cased) column names of the ratings table.
concreteness.columns

In [None]:
# Notebook display: summary statistics of the ratings table's columns.
concreteness.describe()

In [None]:
# For every training sentence, collect the distinct tokens that have a
# concreteness rating, and how many there are.
words_in_concreteness=[]
num_in_concreteness=[]
# wordsconcreteness is already a set, so intersect with it directly instead
# of copying the whole vocabulary for every row.
for i, tokens in enumerate(train_data["tokenized_text"]):
    p=list(wordsconcreteness.intersection(tokens))
    words_in_concreteness.append(p)
    num_in_concreteness.append(len(p))
    if(i%100000==0):
        print("100k done")
train_data["concreteness_words"]=words_in_concreteness
train_data["num_words_concreteness"]=num_in_concreteness

In [None]:
# For every test sentence, collect the distinct tokens that have a
# concreteness rating, and how many there are.
words_in_concreteness=[]
num_in_concreteness=[]
# Use the tokens already stored in "tokenized_text", exactly as for the
# training set, so both sets are matched against the vocabulary the same way
# (word_tokenize is not imported in this notebook).
for i, tokens in enumerate(test_data["tokenized_text"]):
    p=list(wordsconcreteness.intersection(tokens))
    words_in_concreteness.append(p)
    num_in_concreteness.append(len(p))
    if(i%100000==0):
        print("100k done")
test_data["concreteness_words"]=words_in_concreteness
test_data["num_words_concreteness"]=num_in_concreteness

### Numerical Features

- The number of words in the input sample which has 
    - less than 0.5 bigram
    - greater than 0.5 bigram
    - less than 15 phonemes 
    - less than 20 phonemes
- The number of words in the input sample for which the mean concreteness rating is
    - less than 2
    - greater than 2
- The number of words in the input sample for which the standard deviation of the concreteness ratings is
    - less than 2.25
    - greater than 2.25
- The number of words in the input sample for which the number of people not knowing the word is
    - less than 50
    - greater than 50
    - greater than 100
- The number of words in the input sample for which the Percentage of participants who knew the word is
    - less than 95%
    - greater than 95%

In [None]:
# Module-level accumulators, one list per count feature.
words_less_point5_bigram, words_greater_point5_bigram = [], []
words_less2_concm, words_greater2_concm = [], []
words_less125_consd, words_great125_consd = [], []
words_great100_unknown, words_less100_unknown = [], []
total_less50, total_less100, total_great100 = [], [], []
percent_knownless95, percent_knowngreater95 = [], []


def _concreteness_words_where(mask):
    """Return the set of rated words on the rows where ``mask`` is True."""
    # Every such word is in wordsconcreteness by construction, so no extra
    # intersection with the full vocabulary is needed.
    return set(concreteness.loc[mask, "word"])


conc_words_less_point5_bigram = _concreteness_words_where(concreteness["bigram"] < 0.5)
conc_words_greater_point5_bigram = _concreteness_words_where(concreteness["bigram"] >= 0.5)
conc_words_less2_concm = _concreteness_words_where(concreteness["conc.m"] < 2)
conc_words_greater2_concm = _concreteness_words_where(concreteness["conc.m"] >= 2)
# NOTE(review): the two conc.sd sets use different cut-offs (< 2.25 and
# > 1.25), so they overlap; the names suggest 1.25 - confirm the intended one.
conc_words_less125_consd = _concreteness_words_where(concreteness["conc.sd"] < 2.25)
conc_words_great125_consd = _concreteness_words_where(concreteness["conc.sd"] > 1.25)
conc_words_great100_unknown = _concreteness_words_where(concreteness["unknown"] >= 100)
conc_words_less100_unknown = _concreteness_words_where(concreteness["unknown"] < 100)
conc_total_less50 = _concreteness_words_where(concreteness["total"] <= 50)
conc_total_less100 = _concreteness_words_where(concreteness["total"] < 100)
conc_total_great100 = _concreteness_words_where(concreteness["total"] >= 100)
conc_percent_knownless95 = _concreteness_words_where(concreteness["percent_known"] < 0.95)
conc_percent_knowngreater95 = _concreteness_words_where(concreteness["percent_known"] >= 0.95)



In [None]:
def conc_numerical(data, feature_sets=None):
    """Count, per row, how many rated words fall into each concreteness set.

    Args:
        data: DataFrame with a ``"concreteness_words"`` column of word
            collections.
        feature_sets: optional mapping of output column name to the set of
            words counted for that column; defaults to the module-level
            ``conc_*`` word sets under their usual column names.

    Returns:
        A new DataFrame with one row per input row and one integer column per
        feature set, in mapping order.
    """
    if feature_sets is None:
        feature_sets = {
            "words_less_point5_bigram": conc_words_less_point5_bigram,
            "words_greater_point5_bigram": conc_words_greater_point5_bigram,
            "words_less2_concm": conc_words_less2_concm,
            "words_greater2_concm": conc_words_greater2_concm,
            "words_less125_consd": conc_words_less125_consd,
            "words_great125_consd": conc_words_great125_consd,
            "words_great100_unknown": conc_words_great100_unknown,
            "words_less100_unknown": conc_words_less100_unknown,
            "total_less50": conc_total_less50,
            "total_less100": conc_total_less100,
            "total_great100": conc_total_great100,
            "percent_knownless95": conc_percent_knownless95,
            "percent_knowngreater95": conc_percent_knowngreater95,
        }

    # The tallies are local to this call: with shared module-level lists the
    # second call's frame would also contain every row of the first call.
    counts = {name: [] for name in feature_sets}
    for i, rated_words in enumerate(data["concreteness_words"]):
        words = set(rated_words)
        for name, members in feature_sets.items():
            counts[name].append(len(words & members))
        if(i%10000==0):
            print("10k done")
    return pd.DataFrame(counts, columns=list(feature_sets))


In [None]:
# Build the concreteness count features for both data sets, then write each
# table to its CSV file under feature2/.
conc_countfeatures_traindata = conc_numerical(train_data)
conc_countfeatures_testdata = conc_numerical(test_data)
for count_features, csv_path in (
    (conc_countfeatures_traindata, "feature2/concreteness_numerical_features_traindata.csv"),
    (conc_countfeatures_testdata, "feature2/concreteness_numerical_features_testdata.csv"),
):
    count_features.to_csv(csv_path, index=False)

In [None]:
# Store the word-level columns of both data sets so that they can be reused
# without recomputing them.
word_columns = ['tokenized_text', "concreteness_words", 'aoa_words', 'pos_tokens']
for frame, csv_path in (
    (train_data, "feature2/words_traindata.csv"),
    (test_data, "feature2/words_testdata.csv"),
):
    frame[word_columns].to_csv(csv_path, index=False)

### Aggregated Features 

In [None]:
# Average the concreteness ratings of each training sentence's rated words.
rating_columns = ["bigram","conc.m","conc.sd","unknown","total","percent_known","subtlex","dom_pos"]
# `concreteness` has a default integer index, so index a copy by word to look
# words up by label; keep only the numeric columns, since a mean is undefined
# for text columns.
concreteness_by_word = (
    concreteness.set_index("word")[rating_columns].select_dtypes(include="number")
)
# Collect one mean-vector per sentence and build the frame once at the end;
# growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas 2.x.
sentence_means = []
for i, words in enumerate(train_data["concreteness_words"]):
    sentence_means.append(concreteness_by_word.loc[list(words)].mean(axis=0))
    if(i%10000==0):
        print("10k done")
sentence_concreteness = pd.DataFrame(
    sentence_means, columns=concreteness_by_word.columns
).reset_index(drop=True)

In [None]:
# Write the current sentence_concreteness table to the training-set CSV;
# the row index is not written.
sentence_concreteness.to_csv("feature2/concreteness_features_traindata.csv",index=False)

In [None]:
# Average the concreteness ratings of each test sentence's rated words.
rating_columns = ["bigram","conc.m","conc.sd","unknown","total","percent_known","subtlex","dom_pos"]
# `concreteness` has a default integer index, so index a copy by word to look
# words up by label; keep only the numeric columns, since a mean is undefined
# for text columns.
concreteness_by_word = (
    concreteness.set_index("word")[rating_columns].select_dtypes(include="number")
)
# Collect one mean-vector per sentence and build the frame once at the end;
# growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas 2.x.
sentence_means = [
    concreteness_by_word.loc[list(words)].mean(axis=0)
    for words in test_data["concreteness_words"]
]
sentence_concreteness = pd.DataFrame(
    sentence_means, columns=concreteness_by_word.columns
).reset_index(drop=True)

In [None]:
# Write the current sentence_concreteness table to the test-set CSV;
# the row index is not written.
sentence_concreteness.to_csv("feature2/concreteness_features_testdata.csv",index=False)