Imported all the neccessary dependencies

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split , cross_val_score , KFold , GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score
import re   
import nltk

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Mukunthan
[nltk_data]     Periyasamy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Loading the dataset

In [3]:
dataset = pd.read_csv('../dataset/Tamil-News-Headlines.csv')
dataset.head()

Unnamed: 0.1,Unnamed: 0,English Version,Label,News,Author,Date,Authenticity
0,0,Additional charge for users who share password...,tech,பாஸ்வேர்டை பகிரும் பயனர்களிடம் கூடுதல் கட்டணம...,செய்திப்பிரிவு,19-Oct-22,0
1,1,Production of 'AK-203' guns in India by the en...,tech,இந்தியாவில் நடப்பு ஆண்டின் இறுதிக்குள் ‘ஏகே-2...,செய்திப்பிரிவு,18-Oct-22,0
2,2,Moto E22s smartphone launched in India at a bu...,tech,பட்ஜெட் விலையில் மோட்டோ E22s ஸ்மார்ட்போன் இந்...,செய்திப்பிரிவு,17-Oct-22,0
3,3,Let's make Kalam's last dream come true,tech,கலாம் கண்ட கடைசி கனவை நனவாக்குவோம்,செய்திப்பிரிவு,15-Oct-22,0
4,4,Redmi A1+ smartphone launched in India at a bu...,tech,பட்ஜெட் விலையில் ரெட்மி ஏ1+ ஸ்மார்ட்போன் இந்த...,செய்திப்பிரிவு,14-Oct-22,0


checking for null value and dtypes

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5226 entries, 0 to 5225
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       5226 non-null   int64 
 1   English Version  5226 non-null   object
 2   Label            5226 non-null   object
 3   News             5226 non-null   object
 4   Author           5148 non-null   object
 5   Date             5226 non-null   object
 6   Authenticity     5226 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 285.9+ KB


In [5]:
dataset.isnull().sum()  

# there are some null values in the author column but since we are not using that column we can ignore it.    


Unnamed: 0          0
English Version     0
Label               0
News                0
Author             78
Date                0
Authenticity        0
dtype: int64

In [6]:
dataset.count()

Unnamed: 0         5226
English Version    5226
Label              5226
News               5226
Author             5148
Date               5226
Authenticity       5226
dtype: int64

verifying the dataset is balanced

In [7]:
dataset['Authenticity'].value_counts()

Authenticity
1    2902
0    2324
Name: count, dtype: int64

1 --> Fake News

0 --> Real News

Stemming Process :

    Stemming is the process of converting the words to it's root words

In [8]:
porter_stem = PorterStemmer()

def stemming(content):
    stem_content = re.sub('[^a-zA-Z]', ' ', content) 
    stem_content = stem_content.lower()
    stem_content = stem_content.split()
    stem_content = [porter_stem.stem(word) for word in stem_content if word not in stopwords.words('english')]
    stem_content = ' '.join(stem_content)
    return stem_content

In [9]:
dataset['English Version']

0       Additional charge for users who share password...
1       Production of 'AK-203' guns in India by the en...
2       Moto E22s smartphone launched in India at a bu...
3                 Let's make Kalam's last dream come true
4       Redmi A1+ smartphone launched in India at a bu...
                              ...                        
5221    Rumored to be a Corona patient: a laborer who ...
5222    WhatsApp information that causes unrest... the...
5223           'Did the chief minister pay for the vote?'
5224         New Amendment in Information Technology Act!
5225    Right-wing organizations are a threat to the c...
Name: English Version, Length: 5226, dtype: object

Applying stemming to `English Version` feature and storing in a new column `English News`

In [10]:
# Apply stemming to the English Version column and store in a new column
dataset['English News'] = dataset['English Version'].apply(stemming)
dataset[['English Version', 'English News']].head()

Unnamed: 0,English Version,English News
0,Additional charge for users who share password...,addit charg user share password netflix paley ...
1,Production of 'AK-203' guns in India by the en...,product ak gun india end year
2,Moto E22s smartphone launched in India at a bu...,moto e smartphon launch india budget price price
3,Let's make Kalam's last dream come true,let make kalam last dream come true
4,Redmi A1+ smartphone launched in India at a bu...,redmi smartphon launch india budget price price


In [11]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,English Version,Label,News,Author,Date,Authenticity,English News
0,0,Additional charge for users who share password...,tech,பாஸ்வேர்டை பகிரும் பயனர்களிடம் கூடுதல் கட்டணம...,செய்திப்பிரிவு,19-Oct-22,0,addit charg user share password netflix paley ...
1,1,Production of 'AK-203' guns in India by the en...,tech,இந்தியாவில் நடப்பு ஆண்டின் இறுதிக்குள் ‘ஏகே-2...,செய்திப்பிரிவு,18-Oct-22,0,product ak gun india end year
2,2,Moto E22s smartphone launched in India at a bu...,tech,பட்ஜெட் விலையில் மோட்டோ E22s ஸ்மார்ட்போன் இந்...,செய்திப்பிரிவு,17-Oct-22,0,moto e smartphon launch india budget price price
3,3,Let's make Kalam's last dream come true,tech,கலாம் கண்ட கடைசி கனவை நனவாக்குவோம்,செய்திப்பிரிவு,15-Oct-22,0,let make kalam last dream come true
4,4,Redmi A1+ smartphone launched in India at a bu...,tech,பட்ஜெட் விலையில் ரெட்மி ஏ1+ ஸ்மார்ட்போன் இந்த...,செய்திப்பிரிவு,14-Oct-22,0,redmi smartphon launch india budget price price


Feature Selection

In [12]:
X = dataset['English News'].values
y = dataset['Authenticity'].values

print(X)
print(y)

['addit charg user share password netflix paley plan'
 'product ak gun india end year'
 'moto e smartphon launch india budget price price' ...
 'chief minist pay vote' 'new amend inform technolog act'
 'right wing organ threat countri prakashraj']
[0 0 0 ... 1 1 1]


Vectorizing the `English News` feature

In [13]:
encoder = TfidfVectorizer()


Hyper Parameter Tuning

In [14]:
model_params = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {
            'model__C': [0.01, 0.1, 1, 10, 100],
            'model__solver': ['liblinear', 'saga']
        }
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': [2, 5, 10]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params': {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5]
        }
    },
    'Support Vector Machine': {
        'model': SVC(),
        'params': {
            'model__C': [0.1, 1, 10],
            'model__kernel': ['linear', 'rbf']
        }
    },
    'K-Nearest Neighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'model__n_neighbors': [3, 5, 7],
            'model__weights': ['uniform', 'distance']
        }
    }
}

# We'll collect results for each model so we can build a single DataFrame afterwards
results = []
for name, mp in model_params.items():
    pipeline = Pipeline([('tfidf', encoder), ('model', mp['model'])])
    clf = GridSearchCV(pipeline, mp['params'], cv=5, scoring='accuracy', n_jobs=-1)
    clf.fit(X, y)
    # store model name, best params and best score
    results.append((name, clf.best_params_, clf.best_score_))
    print(f"{name} Best Parameters: {clf.best_params_}")
    print(f"{name} Best Cross-Validation Accuracy: {clf.best_score_:.4f}")

Logistic Regression Best Parameters: {'model__C': 10, 'model__solver': 'liblinear'}
Logistic Regression Best Cross-Validation Accuracy: 0.8337
Decision Tree Best Parameters: {'model__max_depth': None, 'model__min_samples_split': 5}
Decision Tree Best Cross-Validation Accuracy: 0.7920
Decision Tree Best Parameters: {'model__max_depth': None, 'model__min_samples_split': 5}
Decision Tree Best Cross-Validation Accuracy: 0.7920
Random Forest Best Parameters: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Random Forest Best Cross-Validation Accuracy: 0.8393
Random Forest Best Parameters: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Random Forest Best Cross-Validation Accuracy: 0.8393
Support Vector Machine Best Parameters: {'model__C': 1, 'model__kernel': 'linear'}
Support Vector Machine Best Cross-Validation Accuracy: 0.8330
Support Vector Machine Best Parameters: {'model__C': 1, 'model__kernel': 'linear'}
Suppo

In [15]:
# Create DataFrame with all model results
model_params_df = pd.DataFrame(results, columns=['Model', 'Best Parameters', 'Best Cross-Validation Accuracy'])

# Display the results
model_params_df

Unnamed: 0,Model,Best Parameters,Best Cross-Validation Accuracy
0,Logistic Regression,"{'model__C': 10, 'model__solver': 'liblinear'}",0.833715
1,Decision Tree,"{'model__max_depth': None, 'model__min_samples...",0.792004
2,Random Forest,"{'model__max_depth': None, 'model__min_samples...",0.839264
3,Support Vector Machine,"{'model__C': 1, 'model__kernel': 'linear'}",0.832951
4,K-Nearest Neighbors,"{'model__n_neighbors': 7, 'model__weights': 'd...",0.77746


From this we can see that the `Random Forest model` performed best with an accuracy of ~84.25%, 

followed by `Logistic Regression` and `Support Vector Machine`, both around ~83.3%. The` Decision 

Tree` and `K-Nearest Neighbors` models had lower performance at ~79.12% and ~77.75% respectively.

Creating a Pipeline

Splitting data into training data and test data

In [16]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

print(X_train.shape , X_test.shape)
print(y_train.shape , y_test.shape)

(4180,) (1046,)
(4180,) (1046,)


Training Model

In [17]:
model_params_df[['Model','Best Parameters']]

Unnamed: 0,Model,Best Parameters
0,Logistic Regression,"{'model__C': 10, 'model__solver': 'liblinear'}"
1,Decision Tree,"{'model__max_depth': None, 'model__min_samples..."
2,Random Forest,"{'model__max_depth': None, 'model__min_samples..."
3,Support Vector Machine,"{'model__C': 1, 'model__kernel': 'linear'}"
4,K-Nearest Neighbors,"{'model__n_neighbors': 7, 'model__weights': 'd..."


In [18]:
model_params_df['Best Parameters'][2]

{'model__max_depth': None,
 'model__min_samples_split': 2,
 'model__n_estimators': 200}

In [19]:
model_pipeline = Pipeline([
    ('encoder',encoder),
    ('model',RandomForestClassifier(max_depth=None, min_samples_split=5, n_estimators=50))
])

In [20]:
model_pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('encoder', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
X_train_prediction = model_pipeline.predict(X_train)

In [22]:
X_train_accuracy = accuracy_score(X_train_prediction,y_train)

print("Training Accuracy :",X_train_accuracy)

Training Accuracy : 0.9978468899521531


In [23]:
X_test_prediction = model_pipeline.predict(X_test)

X_test_accuracy = accuracy_score(X_test_prediction,y_test)

print("Testing Accuracy :",X_test_accuracy)

Testing Accuracy : 0.887189292543021


Training Data Accuracy : 99 %

Testing Data Accuracy : 89 %

In [24]:
classification_report = classification_report(y_test,X_test_prediction)
print(classification_report)

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       482
           1       0.93      0.86      0.89       564

    accuracy                           0.89      1046
   macro avg       0.89      0.89      0.89      1046
weighted avg       0.89      0.89      0.89      1046



Building a Prediction Model for Testing

In [25]:
def fake_news_predtion(news):
    news = stemming(news)
    prediction = model_pipeline.predict([news])

    if prediction[0] == 1:
        print("This is an Fake News")
    else:
        print("This is an Real News")

In [29]:
print(dataset['English Version'][222] , dataset['Authenticity'][222])

Document Sharing | WhatsApp is likely to launch a new feature soon 0


In [27]:
fake_news_predtion("Document Sharing | WhatsApp is likely to launch a new feature soon")

This is an Real News


In [33]:
import pickle

model_path = 'model/model.pkl'
# model = your trained pipeline (can be classifier, vectorizer+classifier, etc.)
with open(model_path, 'wb') as f:
    pickle.dump(model_pipeline, f)
