## IMPORT LIBRARIES

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
df = pd.read_csv("WELFake_Dataset.csv")

## FIRST 5 DATA

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


## REMOVE UNWANTED COLUMNS

In [6]:
df.drop(columns=['Unnamed: 0'] , axis = 1 , inplace = True)

In [7]:
df.columns

Index(['title', 'text', 'label'], dtype='object')

## DATASET INFO

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   71576 non-null  object
 1   text    72095 non-null  object
 2   label   72134 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.7+ MB


## CHECKING NULL VALUES

In [9]:
df.isnull().sum()

title    558
text      39
label      0
dtype: int64

## REMOVING NULL VALUES

In [10]:
df.dropna(inplace = True)

In [11]:
df.isnull().sum()

title    0
text     0
label    0
dtype: int64

## NO OF ROWS & COLUMNS

In [13]:
df.shape

(71537, 3)

In [14]:
df['label'].value_counts()

1    36509
0    35028
Name: label, dtype: int64

## NLP - IMPORT LIBRARIES

In [15]:
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

## NLP - STOPWORDS

In [16]:
stopword = set(stopwords.words('english'))
new_stopword = ["fig","figure","image","sample","using", 
             "show", "result", "large", 
             "also", "one", "two", "three", 
             "four", "five", "seven","eight","nine"]

stopword = list(stopword.union(new_stopword))

## NLP - RE | STOPWORDS REMOVAL | TOKENIZATION | LEMMATIZATION

In [19]:
def clean(text):
    
    # lower case
    text = text.lower()
    
    # removing special characters
    text = re.sub(r'[^a-zA-Z]' , ' ' , text)
    
    # removing html tags
    text = re.sub(r'<.*?>' , ' ' , text)
    
    # tokenization
    text = nltk.word_tokenize(text)
    
    # removing stopwords
    text = [word for word in text if word not in stopword]
    
    # removing word with length less than 3
    text = [word for word in text if len(word)>3]
    
    # lemmatization
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    text = [wnl.lemmatize(word) for word in text]
    
    # converting list into string
    return " ".join(text)

In [20]:
df['title'] = df['title'].apply(lambda x: clean(x))
df['text'] = df['text'].apply(lambda y: clean(y))

## CREATING NEW COLUMN

In [21]:
df['content'] = df['title'] + ' ' + df['text']

## SPLITING X & Y DATA

In [22]:
x = df['content']
y = df['label']

## NLP - TFIDFVECTORIZER

In [23]:
vector = TfidfVectorizer()
vector.fit(x)
x = vector.transform(x)

## ML - IMPORT LIBRARIES

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

## ML - SPLITING TRAIN & TEST DATSET

In [26]:
x_train , x_test , y_train , y_test = train_test_split(x , y , 
                                                       test_size = 0.2 , 
                                                       stratify = y , 
                                                       random_state = 2 )

## ML - GRIDSEARCHCV [ DECISION TREE CLASSIFIER ]

In [28]:
# decision tree

dt = GridSearchCV(DecisionTreeClassifier(),
                 {
                     'criterion':['gini','entropy','log_loss'],
                     'splitter' : ['best','random']
                 } , 
                 cv = 5)

In [29]:
dt.fit(x_train , y_train)

Traceback (most recent call last):
  File "C:\Users\linga\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\linga\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 903, in fit
    super().fit(
  File "C:\Users\linga\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 348, in fit
    criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
KeyError: 'log_loss'

Traceback (most recent call last):
  File "C:\Users\linga\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\linga\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 903, in fit
    super().fit(
  File "C:\Users\linga\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 348, in fit
    criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
KeyError: 'log_loss'

Tracebac

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'splitter': ['best', 'random']})

In [30]:
dt.best_params_

{'criterion': 'gini', 'splitter': 'best'}

## ML - MODEL TRAINING

In [34]:
dtc = DecisionTreeClassifier( criterion = 'gini' , splitter = 'best')

In [35]:
dtc.fit(x_train , y_train)

DecisionTreeClassifier()

In [36]:
dtc.score(x_test , y_test)

0.9269639362594353

## ML - GRIDSEARCHCV [ LOGISTIC REGRESSION ]

In [31]:
# logistic regression

lr = GridSearchCV(LogisticRegression() , 
                 {
                     'penalty':['l1','l2','elasticnet','None']
                 },
                 cv = 5)

In [32]:
lr.fit(x_train , y_train)

Traceback (most recent call last):
  File "C:\Users\linga\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\linga\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\linga\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\linga\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\linga\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'penalty': ['l1', 'l2', 'elasticnet', 'None']})

In [33]:
lr.best_params_

{'penalty': 'l2'}

# ML - MODEL TRAINING

In [38]:
lrc = LogisticRegression(penalty = 'l2')

In [39]:
lrc.fit(x_train , y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [40]:
lrc.score(x_test , y_test)

0.9403131115459883