In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from platform import python_version
'Python ' + python_version()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
fake = pd.read_csv('/content/sample_data/fake.csv')
true = pd.read_csv('/content/sample_data/true.csv')

#Create a new column with their fake values
fake['fake'] = 1.0
true['fake'] = 0.0

#Drop null values
fake = fake.dropna()
true = true.dropna()

#Drop duplicates values
true = true.drop_duplicates()
fake = fake.drop_duplicates()

#Balance the data
fake = fake[:20000]
true = true[:20000]

#Unify the fake data with true data
dTF = pd.concat([fake,true],ignore_index=True)

#Drop null values again
dTF = dTF.dropna()

dTF.shape

#Drop useless columns
dTF = dTF.drop(['date',], axis=1)
dTF = dTF.drop(['subject',], axis=1)

dTF.info()

(40000, 5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   title   40000 non-null  object 
 1   text    40000 non-null  object 
 2   fake    40000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 937.6+ KB


In [None]:
# Let us look at quantile based distribution of story lengths
dTF[dTF.fake == 0.0]['text'].apply(lambda row: len(row.split())).quantile([0.25,0.5,0.75,0.90,0.95])

dTF[dTF.fake == 1.0]['text'].apply(lambda row: len(row.split())).quantile([0.25,0.5,0.75,0.90,0.95])

0.25     85.0
0.50    211.0
0.75    306.0
0.90    454.0
0.95    527.0
Name: text, dtype: float64

0.25    136.0
0.50    195.0
0.75    265.0
0.90    357.0
0.95    441.0
Name: text, dtype: float64

In [None]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')


# Cleaning the text data and to lower case
def lowercase(text):
    text=text.lower()
    return text

def text_clean(text):
    text=re.sub("\[.*#&]"," ",text)
    text=re.sub("https?://\S+|www\.\S+","",text)
    text=re.sub("[^a-zA-Z0-9\s]+","",text)
    text = re.sub('\w*\d\w*', '', text)
    stop_words = set(stopwords.words('english'))
    words = text.split()


    filtered_words = [word for word in words if word not in stop_words]
    text = ' '.join(filtered_words)

    # Remove extra whitespace
    text = re.sub('\s+', ' ', text).strip()


    return text

dTF['text']=dTF['text'].apply(lowercase)
dTF['text']=dTF['text'].apply(text_clean)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.stem import SnowballStemmer

# SnowballStemmer
stemmer = SnowballStemmer('english')

def stem_text(text):
    # Tokenize the input text into individual words
    tokens = nltk.word_tokenize(text)

    # Stem each token
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    # Join the stemmed tokens back into a single string
    return ' '.join(stemmed_tokens)

# Stemming text
dTF['text']=dTF['text'].apply(stem_text)

dTF.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Unnamed: 0,title,text,fake
0,Donald Trump Sends Out Embarrassing New Year’...,donald trump wish american happi new year leav...,1.0
1,Drunk Bragging Trump Staffer Started Russian ...,hous intellig committe chairman devin nune go ...,1.0
2,Sheriff David Clarke Becomes An Internet Joke...,friday reveal former milwauke sheriff david cl...,1.0
3,Trump Is So Obsessed He Even Has Obama’s Name...,christma day donald trump announc would back w...,1.0
4,Pope Francis Just Called Out Donald Trump Dur...,pope franci use annual christma day messag reb...,1.0


In [None]:
# Learning setup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Define TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Vectorize the text data
X = vectorizer.fit_transform(dTF['text'])

#Prepare the data Stage: Training, Developing and Testing
y=dTF['fake']

X_train, X_devtest, y_train, y_devtest = train_test_split(X, y, test_size=0.2, random_state=42)

X_trainStage, X_trainTestStage, y_trainStage, y_trainTestStage = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_dev,X_test,y_dev,y_test = train_test_split(X_devtest, y_devtest, test_size=0.5, random_state=42)



#Data distribution
print("Data for every Stage: ")
print("Data for training",X_train.shape)
print("Data for dev",X_dev.shape)
print("Data for testing",X_test.shape)

#Distribution of data on the stages
print("Data for used in the stage :")
print("Data" ,X_train.shape, "for training stage ",X_trainStage.shape,X_trainTestStage.shape)



Data for every Stage: 
Data for training (32000, 158267)
Data for dev (4000, 158267)
Data for testing (4000, 158267)
Data for used in the stage :
Data (32000, 158267) for training stage  (25600, 158267) (6400, 158267)


In [None]:
from sklearn.neural_network import MLPClassifier
MLPC_model = MLPClassifier(activation="relu",solver="sgd",alpha=0.0,batch_size=1,learning_rate="constant",learning_rate_init=0.001,power_t=0.0,max_iter=100,momentum=0.0,nesterovs_momentum=False,validation_fraction=0.0)


In [None]:
#Test training
MLPC_model.fit(X_trainStage, y_trainStage)

MLPC_model.score(X_trainTestStage,y_trainTestStage)



0.489375

In [None]:
MLPC_model.get_params()

{'activation': 'relu',
 'alpha': 0.0,
 'batch_size': 1,
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 100,
 'momentum': 0.0,
 'n_iter_no_change': 10,
 'nesterovs_momentum': False,
 'power_t': 0.0,
 'random_state': None,
 'shuffle': True,
 'solver': 'sgd',
 'tol': 0.0001,
 'validation_fraction': 0.0,
 'verbose': False,
 'warm_start': False}

In [None]:
#Optimization - modify hidden layers size - batch auto min(200, n_samples) - Momentum to 0.9 - nesterovs momentum enable
MLPC_model = MLPClassifier(hidden_layer_sizes=(100,3),activation="relu",solver="sgd",alpha=0.0,batch_size="auto",learning_rate="constant",learning_rate_init=0.001,power_t=0.0,max_iter=100,momentum=0.9,nesterovs_momentum=True,validation_fraction=0.0)

In [None]:
#Test Training post optimization
MLPC_modelTrain=MLPC_model.fit(X_trainStage, y_trainStage)
MLPC_model
MLPC_modelTrain.score(X_trainTestStage,y_trainTestStage)



0.9540625

In [None]:
#Test dev
MLPC_modelTrain.score(X_dev, y_dev)

0.9555

In [None]:
#Test Training
MLPC_model.score(X_test, y_test)

0.959