# Let's start 

Here's the notebook representing the work done by a group of student who want to use AI and machine learning in distinguishing real/fake news in tweets

In [0]:
import pandas as pd

In [0]:
# Import "fake_or_real_news.csv" from amazonaws website
df = pd.read_csv("https://s3.amazonaws.com/assets.datacamp.com/blog_assets/fake_or_real_news.csv")
    
# Inspect shape of df
df.shape

# Print first lines of df to see columns of the dataset
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [0]:
#Taking each text with it's label
x= df[['text', 'label']]

In [0]:
# Replace  the labels Fake/ Real with binary values 0/1
x["label"] = x["label"].map({"FAKE": 0, "REAL": 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [0]:
from sklearn.feature_extraction.text import CountVectorizer
# Set variables to show the real text 
mask_on = x['label'] == 1
df_real = x[mask_on]['text']

# Instantiate a CountVectorizer
cv1 = CountVectorizer(stop_words = 'english')

# Fit and transform the vectorizer on our corpus
real_cvec = cv1.fit_transform(df_real)

# Convert real_cvec into a DataFrame
real_df = pd.DataFrame(real_cvec.toarray(),
                   columns=cv1.get_feature_names())

# Inspect head of real text cvec
print(real_df.shape)

(3171, 43364)


In [0]:
# Set variables to show fake text
mask_no = x['label'] == 0
df_fake = x[mask_no]['text']

# Instantiate a CountVectorizer
cv2 = CountVectorizer(stop_words = 'english')

# Fit and transform the vectorizer on our corpus
fake_cvec = cv2.fit_transform(df_fake)

# Convert fake_cvec into a DataFrame
fake_df = pd.DataFrame(fake_cvec.toarray(),
                   columns=cv2.get_feature_names())

# Inspect head of fake text cvec
print(fake_df.shape)

(3164, 53064)


In [0]:
#Having a look at the distribution of real vs fake texts
#Baseline score
x['label'].value_counts(normalize=True)

1    0.500552
0    0.499448
Name: label, dtype: float64

In [0]:
#Preparing the data to feed to the machine learning model
XX = x['text']
#Selecting the corresponding labels
y = x['label']

In [0]:
#Preparing the training data and the testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XX,
                                                    y,
                                                    random_state=42,
                                                    stratify=y)

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

#The model will consist of a CountVectorizer method to convert text into numeric representation based on term frequency
#followed by Logistic Regression algorithm to classify fake / real texts (news)
pipe = Pipeline([('cvec', CountVectorizer()),    
                 ('lr', LogisticRegression(solver='liblinear'))])

# Tune GridSearchCV
# This method allow the model to better fine tune the model parameters to get the best results 
pipe_params = {'cvec__stop_words': [None, 'english'],
               'cvec__ngram_range': [(1,1), (2,2), (1,3)],
               'lr__C': [0.01, 1]}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train);
print("Best score:", gs.best_score_)
print("Train score", gs.score(X_train, y_train))
print("Test score", gs.score(X_test, y_test))

gs.best_params_

Best score: 0.9176989148167292
Train score 1.0
Test score 0.9242424242424242


{'cvec__ngram_range': (1, 3), 'cvec__stop_words': 'english', 'lr__C': 1}

In [0]:
#Here's an arbitrary example how to use the model on unseen text and performing the classification
if (gs.predict(['Karoui is president USA'])[0]):
  print("REAL")
else:
  print("FAKE")

FAKE


In [0]:
#Now we can save the model to disk and use it another time
import pickle
filename = 'detect.sav'
pickle.dump(gs, open(filename, 'wb'))