**Import all the needed Libraries**

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import Word
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

**Read the Data from the CSV files in train(X and y) and test(X and y)**

In [2]:
X_train = pd.read_csv('subtaskA_data_all.csv', delimiter = ',')
y_train = pd.read_csv('subtaskA_answers_all.csv', delimiter = ',')
X_test = pd.read_csv('taskA_trial_data.csv', delimiter = ',')
y_test = pd.read_csv('taskA_trial_answer.csv', delimiter = ',')

In [3]:
# Explore the data to check the shape and head to see how data looks like

X_train.head()
X_train.shape
y_train.shape
y_train
# y_test
X_train

Unnamed: 0,id,sent0,sent1
0,0,He poured orange juice on his cereal.,He poured milk on his cereal.
1,1,He drinks apple.,He drinks milk.
2,2,Jeff ran a mile today,"Jeff ran 100,000 miles today"
3,3,A mosquito stings me,I sting a mosquito
4,4,A niece is a person.,A giraffe is a person.
...,...,...,...
9995,9995,Mark ate a big bitter cherry pie,Mark ate a big sweet cherry pie
9996,9996,Gloria wears a cat on her head,Gloria wears a hat on her head
9997,9997,Harry went to the barbershop to have his hair cut,Harry went to the barbershop to have his glass...
9998,9998,Reilly is sleeping on the couch,Reilly is sleeping on the window


In [4]:
  # Exploring the Data
  # mylist.append([X_train.sent0[0], "False"])
  # mylist
  # # X_train.sent0[1]
  # type(y_train)
  # y_train.answer[0] == 0
  # y_train.shape
  # X_train.shape
  # len(y_train)

**Text preprocessing - Lowercasing, Removing stop words, punctuations and lemmatization**

In [5]:
#Making all letters lowercase
X_train['sent0'] = X_train['sent0'].apply(lambda x:" ".join(x.lower() for x in x.split()))
X_train['sent0']
X_train['sent1'] = X_train['sent1'].apply(lambda x:" ".join(x.lower() for x in x.split()))
X_test['sent0'] = X_test['sent0'].apply(lambda x:" ".join(x.lower() for x in x.split()))
X_test['sent1'] = X_test['sent1'].apply(lambda x:" ".join(x.lower() for x in x.split()))
X_train.head()

Unnamed: 0,id,sent0,sent1
0,0,he poured orange juice on his cereal.,he poured milk on his cereal.
1,1,he drinks apple.,he drinks milk.
2,2,jeff ran a mile today,"jeff ran 100,000 miles today"
3,3,a mosquito stings me,i sting a mosquito
4,4,a niece is a person.,a giraffe is a person.


In [6]:
# Removing Punctuation, Symbols
X_train['sent0'] = X_train['sent0'].str.replace('[^\w\s]', ' ')
X_train['sent1'] = X_train['sent1'].str.replace('[^\w\s]', ' ')
X_test['sent0'] = X_test['sent0'].str.replace('[^\w\s]', ' ')
X_test['sent1'] = X_test['sent1'].str.replace('[^\w\s]', ' ')
X_train.head() 

Unnamed: 0,id,sent0,sent1
0,0,he poured orange juice on his cereal,he poured milk on his cereal
1,1,he drinks apple,he drinks milk
2,2,jeff ran a mile today,jeff ran 100 000 miles today
3,3,a mosquito stings me,i sting a mosquito
4,4,a niece is a person,a giraffe is a person


In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
#Remving stopwords using NLTK
stop = stopwords.words('english')
X_train['sent0'] = X_train['sent0'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
X_train['sent1'] = X_train['sent1'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
X_test['sent0'] = X_test['sent0'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
X_test['sent1'] = X_test['sent1'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [10]:
#Lemmatisation
X_train['sent0'] = X_train['sent0'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
X_train['sent1'] = X_train['sent1'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
X_test['sent0'] = X_test['sent0'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
X_test['sent1'] = X_test['sent1'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [11]:
# Correcting Letter Repetitions
def de_repeat(text):
  pattern = re.compile(r"(.)\1{2,}")
  return pattern.sub(r"\1\1", text)

X_train['sent0'] = X_train['sent0'].apply(lambda x: " ".join(de_repeat(x) for x in x.split())) 
X_train['sent1'] = X_train['sent1'].apply(lambda x: " ".join(de_repeat(x) for x in x.split())) 
X_test['sent0'] = X_test['sent0'].apply(lambda x: " ".join(de_repeat(x) for x in x.split())) 
X_test['sent1'] = X_test['sent1'].apply(lambda x: " ".join(de_repeat(x) for x in x.split())) 

In [12]:
#Code to find the top 10,000 rarest words appearing in the data
freq1 = pd.Series(' '.join(X_train['sent0']).split()).value_counts()[-10000:]
freq2 = pd.Series(' '.join(X_train['sent1']).split()).value_counts()[-10000:]

freq3 = pd.Series(' '.join(X_test['sent0']).split()).value_counts()[-10000:]
freq4 = pd.Series(' '.join(X_test['sent1']).split()).value_counts()[-10000:]
print(freq1)
print(freq2)

people       511
put          348
water        319
car          313
like         257
            ... 
pearl          1
combining      1
buttered       1
jellyfish      1
overcome       1
Length: 5987, dtype: int64
people        511
put           342
water         333
car           311
like          262
             ... 
corrected       1
choosing        1
supplement      1
creating        1
captain         1
Length: 5953, dtype: int64


In [13]:
# X_train['sent0'] = X_train['sent0'].apply(lambda x: " ".join( x for x in x.split() if x not in freq1))
# X_train['sent1'] = X_train['sent1'].apply(lambda x: " ".join( x for x in x.split() if x not in freq2))

# X_test['sent0'] = X_test['sent0'].apply(lambda x: " ".join( x for x in x.split() if x not in freq3))
# X_test['sent1'] = X_test['sent1'].apply(lambda x: " ".join( x for x in x.split() if x not in freq4))
# X_train

**Creating a list mylist combining all the sentences in sen0 and sen1 and making sure we don't loose context which is the nonsense sentence (using False for nonsense sentence).**

 This is being done so we get the data ready for passing into TFIDF. TFIDF is a vectorizer which converts Categorical data to Numerical data. 
 
 The reason of converting the data to Numerical Data is so that we can apply Machine Learning Model on it, as Machine Learning model can't be applied on Categorical Data.

**And the Machine Learning Model will do the Classification for us, in this case, given a sentence it will tell us if it nonsense or not**

In [14]:
# list_correct_sen = []
# list_nonsense_sen = []
mylist = []

for i in range(len(y_train)):
    # print(i)
    if(y_train.answer[0] == 0):
         mylist.append([X_train.sent0[i], "False"])
         mylist.append([X_train.sent1[i], "True"])
    if(y_train.answer[0] == 1):
         mylist.append([X_train.sent1[i], "False"])   
         mylist.append([X_train.sent0[i], "False"])     
print(mylist)

mylist2 = []

for i in range(len(y_test)):
    # print(i)
    if(y_test.answer[0] == 0):
         mylist2.append([X_test.sent0[i], "False"])
         mylist2.append([X_test.sent1[i], "True"])
    if(y_test.answer[0] == 1):
         mylist2.append([X_test.sent1[i], "False"])   
         mylist2.append([X_test.sent0[i], "False"])     
print(mylist2)

print(mylist[0])

[['put elephant fridge', 'False'], ['put turkey fridge', 'True'], ['sister eats apple breakfast every day', 'False'], ['sister eats stone breakfast every day', 'True'], ['money used buying car', 'False'], ['money used buying star', 'True'], ['new york located northeastern part usa', 'False'], ['usa located northeastern part new york', 'True'], ['man better see star moon daytime', 'False'], ['man hardly see star moon daytime', 'True'], ['sent restaurant treatment car crash', 'False'], ['sent hospital treatment car crash', 'True'], ['mother became satisfied got good grade math exam', 'False'], ['mother became angry got good grade math exam', 'True'], ['can usually made gold', 'False'], ['can usually made aluminum', 'True'], ['put desktop suitcase departure', 'False'], ['put laptop suitcase departure', 'True'], ['walk moon', 'False'], ['walk school', 'True'], ['work 25 hour day', 'False'], ['work 8 hour day', 'True'], ['hungry water', 'False'], ['hungry food', 'True'], ['bucket hold one g

In [24]:
# The output from above is in list format and fit_transform requires numpy array, so we convert into the same
myarr = np.array(mylist)
myarr1 = np.array(mylist2)

myarr

array([['poured orange juice cereal', 'False'],
       ['poured milk cereal', 'True'],
       ['drink apple', 'False'],
       ...,
       ['reilly sleeping window', 'True'],
       ['desk lamp', 'False'],
       ['lamp desk', 'True']], dtype='<U116')

In [25]:
myarr[:,0]

array(['poured orange juice cereal', 'poured milk cereal', 'drink apple',
       ..., 'reilly sleeping window', 'desk lamp', 'lamp desk'],
      dtype='<U116')

In [16]:
#Encoding output labels, mL can't handle words, convert to numbers
lbl_enc = preprocessing.LabelEncoder()
y_train = lbl_enc.fit_transform(myarr[:,1])
y_test = lbl_enc.fit_transform(myarr1[:,1])

TF-IDF Vectorizer to convert the Features or X_train and X_test data to numerical data. Using hyperparameter ngram_range with dfferent values 1,2 and 3.

Unigrams i.e. only capture single words, thus don't capture probability of the next word

Bigram finds the probability of a given word given a previous word. Thus Bigram performs slightly better than Unigram.

Trigram finds the probability of a given word given past 2 words. Trigram performs better than Bigram

We have used all these 3 Ngrams to create different Vectors and applied the same to the models to observe the difference in accuracies.

In [17]:
#Extracting TF-IDF parameters
tfidf_unigram = TfidfVectorizer(max_features=1000, lowercase=True, analyzer ='word', ngram_range=(1,1))
tfidf_bigram = TfidfVectorizer(max_features=1000, lowercase=True, analyzer ='word', ngram_range=(1,2))
tfidf_trigram = TfidfVectorizer(max_features=1000, lowercase=True, analyzer ='word', ngram_range=(1,3))

X_train_tfidf_unigram = tfidf_unigram.fit_transform(myarr[:,0])
X_test_tfidf_unigram = tfidf_unigram.fit_transform(myarr1[:,0])

X_train_tfidf_bigram = tfidf_bigram.fit_transform(myarr[:,0])
X_test_tfidf_bigram = tfidf_bigram.fit_transform(myarr1[:,0])

X_train_tfidf_trigram = tfidf_trigram.fit_transform(myarr[:,0])
X_test_tfidf_trigram = tfidf_trigram.fit_transform(myarr1[:,0])

#X_train_tfidf_trigram

In [18]:
X_train_tfidf_trigram.shape

(20000, 1000)

In [19]:
y_train.shape

(20000,)

In [20]:
# y_train = y_train.reshape((20000,1))
# y_train.shape

**Now we have data in the required format to apply ML Classification Models. We will use 3 different models and check accuracy of each.**

In [21]:
#Model: Multinomial Naive Bayes Classifier

from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_tfidf_unigram, y_train)
y_pred_unigram = nb.predict(X_test_tfidf_unigram)
print('naive bayes tfidf accuracy with unigram is %s' % accuracy_score(y_pred_unigram, y_test) )

nb.fit(X_train_tfidf_bigram, y_train)
y_pred_bigram = nb.predict(X_test_tfidf_bigram)
print('naive bayes tfidf accuracy with bigram is %s' % accuracy_score(y_pred_bigram, y_test) )

nb.fit(X_train_tfidf_trigram, y_train)
y_pred_trigram = nb.predict(X_test_tfidf_trigram)
print('naive bayes tfidf accuracy with trigram is %s' % accuracy_score(y_pred_trigram, y_test) )

# naive bayes tfidf accuracy with unigram is 0.4962889658584859
# naive bayes tfidf accuracy with bigram is 0.500247402276101
# naive bayes tfidf accuracy with trigram is 0.5029688273132112


naive bayes tfidf accuracy with unigram is 0.4962889658584859
naive bayes tfidf accuracy with bigram is 0.500247402276101
naive bayes tfidf accuracy with trigram is 0.5029688273132112


In [22]:
#Using another Model - KNN  with n =3, why 3?
# n performs well when its not low value or too high value, so 3 is a popular choice and works generally well

from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=3)

neigh.fit(X_train_tfidf_unigram, y_train)
y_pred_unigram = nb.predict(X_test_tfidf_unigram)
print('KNN tfidf accuracy with bigram is %s' % accuracy_score(y_pred_unigram, y_test) )

neigh.fit(X_train_tfidf_bigram, y_train)
y_pred_bigram = nb.predict(X_test_tfidf_bigram)
print('KNN tfidf accuracy with bigram is %s' % accuracy_score(y_pred_bigram, y_test) )

neigh.fit(X_train_tfidf_trigram, y_train)
y_pred_trigram = nb.predict(X_test_tfidf_trigram)
print('KNN tfidf accuracy with trigram is %s' % accuracy_score(y_pred_trigram, y_test) )

# KNN tfidf accuracy with bigram is 0.5079168728352301
# KNN tfidf accuracy with bigram is 0.512370113805047
# KNN tfidf accuracy with trigram is 0.5029688273132112

KNN tfidf accuracy with bigram is 0.5079168728352301
KNN tfidf accuracy with bigram is 0.512370113805047
KNN tfidf accuracy with trigram is 0.5029688273132112


In [23]:
# Using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
randomForestClf = RandomForestClassifier()

randomForestClf.fit(X_train_tfidf_unigram, y_train) 
y_pred_unigram = nb.predict(X_test_tfidf_unigram)
print('random forest tfidf accuracy with unigram is %s' % accuracy_score(y_pred_unigram, y_test) )

randomForestClf.fit(X_train_tfidf_bigram, y_train)
y_pred_bigram = nb.predict(X_test_tfidf_bigram)
print('random forest tfidf accuracy with bigram is %s' % accuracy_score(y_pred_bigram, y_test) )

randomForestClf.fit(X_train_tfidf_trigram, y_train)
y_pred_trigram = nb.predict(X_test_tfidf_trigram)
print('random forest tfidf accuracy with trigram is %s' % accuracy_score(y_pred_trigram, y_test) )

# random forest tfidf accuracy with unigram is 0.5079168728352301
# random forest tfidf accuracy with bigram is 0.512370113805047
# random forest tfidf accuracy with trigram is 0.5029688273132112


KeyboardInterrupt: ignored

**Conclusion - We tried 3 different Models - Multinomial NB, KNN and Random Forest Classifier and we found that the Multinomial NB performed the best out of these 3 with 50.4% accuracy**

50.4% accuracy means that we could calssify almost 50% sentences correctly as nonsense sentences