## Importing important libraries and reading the training and testing data

In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re
import string
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize,word_tokenize
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
train_df = pd.read_csv("../input/datafiles/train_df.csv")
test_df = pd.read_csv("../input/datafiles/test_df.csv")

## EDA of training data

### Checking for null values, invalid target values and duplicated rows

In [3]:
train_df.describe()

Unnamed: 0,target
count,1000000.0
mean,0.06187
std,0.240919
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [4]:
train_df.isna().sum()

qid              0
question_text    0
target           0
dtype: int64

In [5]:
train_df["target"].unique()

array([0, 1])

In [6]:
train_df.index[train_df.duplicated()]

Int64Index([], dtype='int64')

## EDA of testing data

### Checking for null values and duplicated rows

In [7]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,My period is due on my wedding day. How can I ...
1,9914c62ed3f69684d549,How many numbers higher than a million can be ...
2,8138ae48649e37091a91,"How come I feel nothing for my family, but sti..."
3,981b4753d17ef14d09f7,"In case of collapse of the Democratic party, w..."
4,452e2c705276ba16b7b7,Who is Émile Naoumoff?
...,...,...
306117,a352dff4fcc2571815ce,Did anyone get an update on Maruti Suzuki All ...
306118,ad4a8498d97c536c67b9,What 5 people in history do you find the most ...
306119,19784a27b55d4b453fda,How can I remove the tan on my forehead?
306120,370191dba26465997879,"If you are a well known hacker, will you be mo..."


In [8]:
test_df.isna().sum()

qid              0
question_text    0
dtype: int64

In [9]:
test_df.index[test_df.duplicated()]

Int64Index([], dtype='int64')

## Preprocessing training and testing data

### Cleaning the text
Common data cleaning steps on all text:

1.Make text all lower case

2.Remove punctuation

3.Remove numerical values

4.Remove common non-sensical text (/n)

5.Tokenize text

6.Remove stop words

7.Stemming / lemmatization


### Here (in round 1) we are doing the following things:-
1. Removing text in square brackets
2. Removing punctuation marks from the text
3. Removing words containing numbers.

In [10]:
def clean_text_round1(text):
    '''remove text in square brackets, remove punctuation and remove words containing numbers.'''
    # text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [11]:
train_df.question_text= train_df.question_text.apply(round1)
test_df.question_text= test_df.question_text.apply(round1)

### Here (in round2) we are doing:-
1. Getting rid of additional punctuation
2. Removing some non-sensical text
3. Removing urls
4. Removing HTML tags

In [12]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r"http\S+", '', text)
    text = re.sub(re.compile('<.*?>') , '', text)

    return text
round2 = lambda x: clean_text_round2(x)

In [13]:
train_df.question_text= train_df.question_text.apply(round2)
test_df.question_text= test_df.question_text.apply(round2)
test_df.question_text

0         My period is due on my wedding day How can I s...
1         How many numbers higher than a million can be ...
2         How come I feel nothing for my family but stil...
3         In case of collapse of the Democratic party wi...
4                                     Who is Émile Naoumoff
                                ...                        
306117    Did anyone get an update on Maruti Suzuki All ...
306118    What  people in history do you find the most i...
306119              How can I remove the tan on my forehead
306120    If you are a well known hacker will you be mor...
306121    If your new enemies be bigger and more dangero...
Name: question_text, Length: 306122, dtype: object

### Tokenization
Tokenization is the process of segmenting running text into sentences and words. In essence, it’s the task of cutting a text into pieces called tokens. 
$\newline$ Here we are going to use word tokenizer i.e. the words are the tokens

In [14]:
def tokenization(text):
    return word_tokenize(text)
# applying function to the column
train_df['question_text']= train_df['question_text'].apply(lambda x: tokenization(x))
test_df['question_text']= test_df['question_text'].apply(lambda x: tokenization(x))

In [15]:
train_df.question_text

0         [What, are, interesting, facts, about, Microso...
1         [What, are, those, things, which, are, not, go...
2         [What, should, I, know, to, avoid, being, upso...
3          [How, I, add, any, account, with, payment, bank]
4         [Which, Multi, level, marketing, products, are...
                                ...                        
999995                     [How, is, CSE, at, VIT, Chennai]
999996    [How, can, we, prevent, a, holocaust, by, robo...
999997    [How, can, I, help, a, student, remember, key,...
999998    [What, is, the, difference, between, lace, clo...
999999    [What, happens, when, you, look, into, a, brok...
Name: question_text, Length: 1000000, dtype: object

In [16]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,"[My, period, is, due, on, my, wedding, day, Ho..."
1,9914c62ed3f69684d549,"[How, many, numbers, higher, than, a, million,..."
2,8138ae48649e37091a91,"[How, come, I, feel, nothing, for, my, family,..."
3,981b4753d17ef14d09f7,"[In, case, of, collapse, of, the, Democratic, ..."
4,452e2c705276ba16b7b7,"[Who, is, Émile, Naoumoff]"
...,...,...
306117,a352dff4fcc2571815ce,"[Did, anyone, get, an, update, on, Maruti, Suz..."
306118,ad4a8498d97c536c67b9,"[What, people, in, history, do, you, find, the..."
306119,19784a27b55d4b453fda,"[How, can, I, remove, the, tan, on, my, forehead]"
306120,370191dba26465997879,"[If, you, are, a, well, known, hacker, will, y..."


### Lemmatization
Lemmatization is a tool that performs full morphological analysis to more accurately find the root, or “lemma” for a word.

In [17]:
# Lemmatization
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
train_df['question_text']=train_df['question_text'].apply(lambda x:lemmatizer(x))
test_df['question_text']=test_df['question_text'].apply(lambda x:lemmatizer(x))
# train_df.shape
train_df

Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,"[What, are, interesting, fact, about, Microsof...",0
1,dc708b74a108d0fc0ad9,"[What, are, those, thing, which, are, not, gon...",0
2,06a27ec5d82dacd8bfe0,"[What, should, I, know, to, avoid, being, upso...",0
3,00cbb6b17e3ceb7c5358,"[How, I, add, any, account, with, payment, bank]",0
4,7c304888973a701585a0,"[Which, Multi, level, marketing, product, are,...",0
...,...,...,...
999995,4bd96088d0b5f0f2c4f4,"[How, is, CSE, at, VIT, Chennai]",0
999996,e80edbfc086f7125940f,"[How, can, we, prevent, a, holocaust, by, robo...",0
999997,1506dfad6bd340782a1f,"[How, can, I, help, a, student, remember, key,...",0
999998,b56c60fd407f2f85553c,"[What, is, the, difference, between, lace, clo...",0


### Stop Words removal
Stop words are commonly occurring words that for some computational processes provide little information or in some cases introduce unnecessary noise and therefore need to be removed.

In [18]:
stopwords=stopwords.words('english')
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
#applying the function
train_df['question_text']= train_df['question_text'].apply(lambda x:remove_stopwords(x))
test_df['question_text']= test_df['question_text'].apply(lambda x:remove_stopwords(x))

In [19]:
train_df

Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,"[What, interesting, fact, Microsoft, history]",0
1,dc708b74a108d0fc0ad9,"[What, thing, gon, na, happen, ever]",0
2,06a27ec5d82dacd8bfe0,"[What, I, know, avoid, upsold, getting, car, b...",0
3,00cbb6b17e3ceb7c5358,"[How, I, add, account, payment, bank]",0
4,7c304888973a701585a0,"[Which, Multi, level, marketing, product, actu...",0
...,...,...,...
999995,4bd96088d0b5f0f2c4f4,"[How, CSE, VIT, Chennai]",0
999996,e80edbfc086f7125940f,"[How, prevent, holocaust, robot, AI, alien]",0
999997,1506dfad6bd340782a1f,"[How, I, help, student, remember, key, step, i...",0
999998,b56c60fd407f2f85553c,"[What, difference, lace, closure, lace, frontal]",0


### Creating sentences out of the remaining words 

In [20]:
def makeSentence(text):
    return ' '.join(list(text))

train_df.question_text = train_df['question_text'].apply(lambda x: makeSentence(x))
test_df.question_text = test_df['question_text'].apply(lambda x: makeSentence(x))

print(train_df.question_text)
# print(makeSentence(train_df.question_text[0]))
# ' '.join(list(train_df.question_text)[0])

0                   What interesting fact Microsoft history
1                             What thing gon na happen ever
2         What I know avoid upsold getting car brake cha...
3                            How I add account payment bank
4         Which Multi level marketing product actually w...
                                ...                        
999995                                  How CSE VIT Chennai
999996                 How prevent holocaust robot AI alien
999997    How I help student remember key step informati...
999998            What difference lace closure lace frontal
999999                      What happens look broken mirror
Name: question_text, Length: 1000000, dtype: object


In [21]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,My period due wedding day How I stop Pill option
1,9914c62ed3f69684d549,How many number higher million formed digit
2,8138ae48649e37091a91,How come I feel nothing family still love pet ...
3,981b4753d17ef14d09f7,In case collapse Democratic party Republican p...
4,452e2c705276ba16b7b7,Who Émile Naoumoff
...,...,...
306117,a352dff4fcc2571815ce,Did anyone get update Maruti Suzuki All India ...
306118,ad4a8498d97c536c67b9,What people history find interesting
306119,19784a27b55d4b453fda,How I remove tan forehead
306120,370191dba26465997879,If well known hacker prone hacked


### Doing spellcheck

In [22]:
!pip install symspellpy

Collecting symspellpy
  Downloading symspellpy-6.7.7-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting editdistpy>=0.1.3
  Downloading editdistpy-0.1.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.5/125.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: editdistpy, symspellpy
Successfully installed editdistpy-0.1.3 symspellpy-6.7.7
[0m

In [23]:
import pkg_resources
from symspellpy import SymSpell, Verbosity
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"
)


def spellcorrect(text):
#   print(text)
  text_list = str(text).split()
  text_list_spell = []
  for word in text_list:
    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
    if suggestions:
        text_list_spell.append(suggestions[0].term)
    else:
        text_list_spell.append(word)
  return' '.join(text_list_spell)
train_df['question_text']= train_df['question_text'].apply(lambda x: spellcorrect(x))
train_df

Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,What interesting fact Microsoft history,0
1,dc708b74a108d0fc0ad9,What thing gon na happen ever,0
2,06a27ec5d82dacd8bfe0,What I know avoid upsold getting car brake cha...,0
3,00cbb6b17e3ceb7c5358,How I add account payment bank,0
4,7c304888973a701585a0,Which Multi level marketing product actually w...,0
...,...,...,...
999995,4bd96088d0b5f0f2c4f4,How CSE VIT Chennai,0
999996,e80edbfc086f7125940f,How prevent holocaust robot AI alien,0
999997,1506dfad6bd340782a1f,How I help student remember key step informati...,0
999998,b56c60fd407f2f85553c,What difference lace closure lace frontal,0


### Removing non eng words

In [24]:
import nltk
words = set(nltk.corpus.words.words())
train_df.question_text = train_df.question_text.apply(lambda x: " ".join(w for w in nltk.wordpunct_tokenize(x) \
         if w.lower() in words or not w.isalpha()))

In [25]:
train_df

Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,What interesting fact history,0
1,dc708b74a108d0fc0ad9,What thing gon na happen ever,0
2,06a27ec5d82dacd8bfe0,What I know avoid getting car brake,0
3,00cbb6b17e3ceb7c5358,How I add account payment bank,0
4,7c304888973a701585a0,Which level marketing product actually worth,0
...,...,...,...
999995,4bd96088d0b5f0f2c4f4,How,0
999996,e80edbfc086f7125940f,How prevent holocaust robot AI alien,0
999997,1506dfad6bd340782a1f,How I help student remember key step informati...,0
999998,b56c60fd407f2f85553c,What difference lace closure lace frontal,0


In [26]:
train_df.to_csv("preprocessed_train.csv",index=False)
test_df.to_csv("preprocessed_test.csv",index=False)

## Reading Preprocessed data

In [27]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc,f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV

In [28]:
train_df = pd.read_csv("../input/preprocessed-train-and-test/preprocessed_train.csv")
test_df = pd.read_csv("../input/preprocessed-train-and-test/preprocessed_test.csv")

In [29]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,My period due wedding day How I stop Pill option
1,9914c62ed3f69684d549,How many number higher million formed digit
2,8138ae48649e37091a91,How come I feel nothing family still love pet ...
3,981b4753d17ef14d09f7,In case collapse Democratic party Republican p...
4,452e2c705276ba16b7b7,Who Émile Naoumoff
...,...,...
306117,a352dff4fcc2571815ce,Did anyone get update Maruti Suzuki All India ...
306118,ad4a8498d97c536c67b9,What people history find interesting
306119,19784a27b55d4b453fda,How I remove tan forehead
306120,370191dba26465997879,If well known hacker prone hacked


## Vectorization of the dataset

In [30]:
tfidf = CountVectorizer(ngram_range=(1,3))
# Numericalize the train dataset
train = tfidf.fit_transform(train_df.question_text.values.astype('U'))

# for test_df
test_df_matrix = tfidf.transform(test_df.question_text.values.astype('U'))
# print(test_df_matrix.shape)
test_df_matrix

<306122x7817595 sparse matrix of type '<class 'numpy.int64'>'
	with 3799907 stored elements in Compressed Sparse Row format>

In [31]:
train

<1000000x7817595 sparse matrix of type '<class 'numpy.int64'>'
	with 19226935 stored elements in Compressed Sparse Row format>

## Doing train test split of the train_df.csv

In [32]:

X_train, X_test, y_train, y_test = train_test_split(
    train, train_df.target.values, test_size=0.10, stratify=train_df.target.values)

X_train.shape
train = X_train
test = X_test

In [33]:
train.shape

(900000, 7817595)

In [34]:
# import pickle

# def storeData(text,obj):
#     # Its important to use binary mode
#     dbfile = open(text+'.pickle', 'wb')
#     # source, destination
#     pickle.dump(obj, dbfile)                     
#     dbfile.close()

In [35]:
# X_train

In [36]:
# storeData('X_train',X_train)
# storeData('X_test',X_test)
# storeData('y_train',y_train)
# storeData('y_test',y_test)
# storeData('test_df_matrix',test_df_matrix)

In [37]:
from sklearn.linear_model import LogisticRegression 
model = LogisticRegression(penalty='l2', max_iter=50000,solver='lbfgs',class_weight='balanced')
model.fit(train,y_train)
y_pred = model.predict(train)
print("train f1 score: ",f1_score(y_train,y_pred))
print("test f1 score: ",f1_score(y_test,model.predict(test)))
test_y_pred = model.predict(test_df_matrix)
print(metrics.classification_report(y_train,y_pred) )

train f1 score:  0.9587386378322491
test f1 score:  0.6243588762152644
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    844317
           1       0.92      1.00      0.96     55683

    accuracy                           0.99    900000
   macro avg       0.96      1.00      0.98    900000
weighted avg       1.00      0.99      0.99    900000



In [38]:
Test_DF_TARGET = pd.DataFrame(test_y_pred,columns=['target'])
TEST_DF_QID = pd.DataFrame(test_df ,columns=['qid'])
TEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join='inner')
TEST_DF.to_csv("sample_submission_withoutLowerNgrams=1_3_spellcheckandnonengwords.csv",index=False)

In [39]:
# !python -m spacy download en_core_web_sm

In [40]:
# import spacy