In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [2]:
sentence = pd.read_csv('./data.csv')

In [3]:
sentence

Unnamed: 0,sentence,label
0,Over 90% of the world's population has access ...,question
1,The Mona Lisa is one of the most famous painti...,question
2,The human genome contains over 3 billion base ...,question
3,The average person spends 6 months of their li...,question
4,Over 90% of the world's trade is conducted thr...,question
...,...,...
190438,Federal courts are solely creatures of the fe...,non-question
190439,More than 440 blocos operate in Rio,non-question
190440,Although they were used only for instructiona...,non-question
190441,"8% of the population, while 68.",non-question


In [4]:
sentence['label'].value_counts()

question        117917
non-question     72526
Name: label, dtype: int64

Check if there are any NULL values

In [5]:
sentence.isnull()

Unnamed: 0,sentence,label
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
190438,False,False
190439,False,False
190440,False,False
190441,False,False


In [6]:
sentence.isnull().sum()

sentence    0
label       0
dtype: int64

In [7]:
sentence.shape

(190443, 2)

In [8]:
sentence['label'].unique()

array(['question', 'non-question'], dtype=object)

Label Encoding

In [9]:
label_encoder=preprocessing.LabelEncoder()
sentence['label']=label_encoder.fit_transform(sentence['label'])

In [10]:
sentence['label'].unique()

array([1, 0])

In [11]:
sentence

Unnamed: 0,sentence,label
0,Over 90% of the world's population has access ...,1
1,The Mona Lisa is one of the most famous painti...,1
2,The human genome contains over 3 billion base ...,1
3,The average person spends 6 months of their li...,1
4,Over 90% of the world's trade is conducted thr...,1
...,...,...
190438,Federal courts are solely creatures of the fe...,0
190439,More than 440 blocos operate in Rio,0
190440,Although they were used only for instructiona...,0
190441,"8% of the population, while 68.",0


In [12]:
X=sentence['sentence']
y=sentence['label']

In [13]:
y

0         1
1         1
2         1
3         1
4         1
         ..
190438    0
190439    0
190440    0
190441    0
190442    0
Name: label, Length: 190443, dtype: int32

In [14]:
X

0         Over 90% of the world's population has access ...
1         The Mona Lisa is one of the most famous painti...
2         The human genome contains over 3 billion base ...
3         The average person spends 6 months of their li...
4         Over 90% of the world's trade is conducted thr...
                                ...                        
190438     Federal courts are solely creatures of the fe...
190439                  More than 440 blocos operate in Rio
190440     Although they were used only for instructiona...
190441                      8% of the population, while 68.
190442    The era saw Libya's return to the internationa...
Name: sentence, Length: 190443, dtype: object

Split data to training and testing data

In [15]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [16]:
print(X_train.shape)
print(X_test.shape )
print(y_train.shape )
print(y_test.shape)


(152354,)
(38089,)
(152354,)
(38089,)


Feature Extraction

In [51]:
# transforming the text data to feature vectors that can be used as the input to the logistic regression model

fe = TfidfVectorizer(
    # ngram_range=(3,3), :accuracy on test data = 0.6357741080101866 
    # ngram_range=(2,3), :accuracy on test data =  0.8074509700963532, accuracy on training data : 0.8120364414455807
    ngram_range=(1,3), #:accuracy on training data :  0.959449702666159, accuracy on test data :  0.9549738769723543/ 0.9537136706135629
    # ngram_range=(1,5),  #:accuracy on training data :  0.959449702666159, accuracy on test data :  0.9550001312714957
    # ngram_range=(1,10),#:accuracy on training data :  0.9594431390052115, accuracy on test data : 0.9550526398697787 (taking a lot of time for execution)
    min_df=0.001, 
    max_df=1.0, 
    # stop_words='english', :accuracy on train data = around 71% and on test data = around 63%
    analyzer='word'
)

# Removing stop words from the corpus is decreasing the accuracy of the model...may be because words like "is","are" that carry much more importance in...
# ..world of sentences..for example "Are you going tomorrow?" and "You are going tomorrow."...except for the punctuation mark, all the words have same
# frequency, but the idea of sentence depends on the placement of the word...so removing such stop words can really affect the accuracy of the model



# fe=TfidfVectorizer(min_df=0, stop_words='english',lowercase='True')  :accuracy on train data = around 79% and on test data = around 74%

X_trfeature=fe.fit_transform(X_train)
X_ttfeature=fe.transform(X_test)



In [52]:
print(X_trfeature)

  (0, 639)	0.11415057095628632
  (0, 3795)	0.21363532344120026
  (0, 2401)	0.17908905935285202
  (0, 3713)	0.18381368372950913
  (0, 1688)	0.2105233317270154
  (0, 201)	0.16349188003225987
  (0, 4221)	0.08363159941825277
  (0, 964)	0.21748370693856542
  (0, 4001)	0.36001447807110265
  (0, 4173)	0.0902647568160653
  (0, 585)	0.16451453671962976
  (0, 606)	0.14668438486788676
  (0, 2883)	0.177379666296288
  (0, 4250)	0.1258040831599302
  (0, 635)	0.08169157253390366
  (0, 866)	0.2083373715417566
  (0, 3793)	0.12732071893035246
  (0, 223)	0.12531233525511826
  (0, 2400)	0.04510064400604442
  (0, 1613)	0.39975911370132394
  (0, 3917)	0.17573154963177717
  (0, 3411)	0.10994759199982285
  (0, 4018)	0.21614061305604604
  (0, 4033)	0.14038903328112545
  (0, 2831)	0.34287002430113417
  :	:
  (152352, 1623)	0.5660337209618691
  (152352, 3010)	0.3225740620103107
  (152352, 3969)	0.15297215780102785
  (152352, 3962)	0.11460543736240544
  (152352, 3744)	0.14718304515722594
  (152352, 1672)	0.142768

Training the model(LOGISTIC REGRESSION)

In [53]:
model=LogisticRegression(solver='lbfgs', max_iter=1000)

In [54]:
model.fit(X_trfeature,y_train)

LogisticRegression(max_iter=1000)

In [55]:
prediction=model.predict(X_trfeature)

accuracy=accuracy_score(y_train,prediction)

In [56]:
print('accuracy on training data : ',accuracy)

accuracy on training data :  0.9592462291767856


In [57]:
prediction=model.predict(X_ttfeature)

accuracy=accuracy_score(y_test,prediction)

In [58]:
print('accuracy on test data : ',accuracy)

accuracy on test data :  0.9537136706135629
