In [1]:
# Data Wrangling
import numpy as np
import pandas as pd
import polars as pl

# Data Visualization
import cufflinks as cf
import seaborn as sns
import matplotlib.pyplot as plt 

# Reducción de dimensionalidad
from varclushi import VarClusHi
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
from sklearn.model_selection import train_test_split, cross_val_score

# Text mining
import re
import emoji # pip install emoji
import unicodedata
from bs4 import BeautifulSoup # pip install bs4
import nltk

# Preprocesamiento
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, mean_absolute_error, r2_score, mean_squared_error, confusion_matrix, classification_report

# Environment setup
import PyPDF2
cf.set_config_file(offline=True)
from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
df_train = pd.read_csv('fake_news_train.csv') 
df_test = pd.read_csv('fake_news_test.csv') 

In [3]:
df = df_train.copy()

In [4]:
df.shape

(35918, 5)

In [5]:
df.fake_not.value_counts(1)

fake_not
1    0.522802
0    0.477198
Name: proportion, dtype: float64

In [6]:
df.sample(5)

Unnamed: 0,title,text,subject,date,fake_not
8796,Kremlin says Donald Trump pre-election clip de...,MOSCOW (Reuters) - The Kremlin on Thursday con...,politicsNews,"March 17, 2016",0
20344,U.S. Interior chief recommends shrinking Utah'...,WASHINGTON (Reuters) - U.S. Interior Secretary...,politicsNews,"June 12, 2017",0
23336,LEFTY STUNT BACKFIRES: Inflatable “Trump Chick...,It s so funny that a lefty effort to harm Pres...,left-news,"Aug 10, 2017",1
29968,Few tears in China as old friend Mugabe ousted...,BEIJING (Reuters) - China is shedding few tear...,worldnews,"November 22, 2017",0
12746,"After Trump victory, Obama and Merkel stress b...","BERLIN (Reuters) - German and U.S. employers, ...",politicsNews,"November 16, 2016",0


In [7]:
X = df.drop('fake_not',axis=1) 
y = df['fake_not']

In [8]:
df.subject.value_counts()

subject
politicsNews       9047
worldnews          8093
News               7271
politics           5451
left-news          3585
Government News    1254
US_News             613
Middle-east         604
Name: count, dtype: int64

In [9]:
def clean_words(text):
    res =  unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    res = re.sub("[^a-zA-Z0-9 ]"," ", res.decode("utf-8"), flags=re.UNICODE)
    res =  u' '.join(res.lower().split())
    return res
    

In [10]:
df['title_c']=df['title'].map(clean_words)
df['text_c']=df['text'].map(clean_words)

In [11]:
df.sample(10)

Unnamed: 0,title,text,subject,date,fake_not,title_c,text_c
29474,First Supreme Court 4-4 Tie Screws Republican...,Republicans were handed a harsh reminder of ho...,News,"March 30, 2016",1,first supreme court 4 4 tie screws republicans...,republicans were handed a harsh reminder of ho...
24638,Instant View: Senate plan delays corporate tax...,(Reuters) - U.S. Senate Republicans’ version o...,politicsNews,"November 9, 2017",0,instant view senate plan delays corporate tax ...,reuters u s senate republicans version of a ta...
30128,"Australia to discuss North Korea, Islamist ter...",SYDNEY (Reuters) - Australia s Prime Minister ...,worldnews,"November 12, 2017",0,australia to discuss north korea islamist terr...,sydney reuters australia s prime minister malc...
13074,Oregon Congressman Rips Justice Department Fo...,Oregon Rep. Peter DeFazio took federal authori...,News,"January 16, 2016",1,oregon congressman rips justice department for...,oregon rep peter defazio took federal authorit...
30144,Spain's constitutional court suspends Catalan ...,MADRID (Reuters) - The Spanish Constitutional ...,worldnews,"September 7, 2017",0,spain s constitutional court suspends catalan ...,madrid reuters the spanish constitutional cour...
31859,OBAMA UNDERMINES AMERICA…Plans To Slash Nuclea...,Dinesh D Sousa warned us about Obama s reducti...,politics,"Apr 4, 2016",1,obama undermines americaplans to slash nuclear...,dinesh d sousa warned us about obama s reducti...
8118,4 Insanely Great Ways One Idea From Obama Jus...,The New York Times has done a major new study ...,News,"April 18, 2016",1,4 insanely great ways one idea from obama just...,the new york times has done a major new study ...
14448,Australian police seize record A$1 billion met...,SYDNEY (Reuters) - Australian police said on F...,worldnews,"December 22, 2017",0,australian police seize record a 1 billion met...,sydney reuters australian police said on frida...
17244,Trump's funding request for U.S. border wall h...,WASHINGTON (Reuters) - President Donald Trump’...,politicsNews,"March 28, 2017",0,trump s funding request for u s border wall hi...,washington reuters president donald trumps cal...
5131,Uganda in anti-online pornography drive seen b...,KAMPALA (Reuters) - Uganda is launching a driv...,worldnews,"September 7, 2017",0,uganda in anti online pornography drive seen b...,kampala reuters uganda is launching a drive ag...


In [12]:
from fuzzywuzzy import fuzz 

df['token_set_ratio'] = [fuzz.token_set_ratio(df['title_c'][i],df['text_c'][i] ) for i in range(len(df))]
df['fw_ratio'] = [fuzz.ratio(df['title_c'][i],df['text_c'][i] ) for i in range(len(df))]
df['fw_partial_ratio'] = [fuzz.partial_ratio(df['title_c'][i],df['text_c'][i] ) for i in range(len(df))]
df['fw_partial_ratio2'] = [fuzz.partial_ratio(df['text_c'][i],df['title_c'][i] ) for i in range(len(df))]

df

Unnamed: 0,title,text,subject,date,fake_not,title_c,text_c,token_set_ratio,fw_ratio,fw_partial_ratio,fw_partial_ratio2
0,Barca's Valverde skirts independence debate as...,"MADRID, (Reuters) - Barcelona boss Ernesto Val...",worldnews,"October 27, 2017",0,barca s valverde skirts independence debate as...,madrid reuters barcelona boss ernesto valverde...,90,0,25,25
1,Trump says Brexit wouldn't impact potential UK...,LONDON (Reuters) - Presumptive U.S. Republican...,politicsNews,"May 15, 2016",0,trump says brexit wouldn t impact potential uk...,london reuters presumptive u s republican pres...,75,0,35,35
2,Mark Zuckerberg’s PERFECT Facebook Comment To...,There are seldom people in life that you can h...,News,"January 4, 2016",1,mark zuckerbergs perfect facebook comment to g...,there are seldom people in life that you can h...,71,2,47,47
3,Suspected U.S. drone targets Haqqani militants...,"DERA ISMAIL KHAN, Pakistan (Reuters) - A suspe...",worldnews,"November 30, 2017",0,suspected u s drone targets haqqani militants ...,dera ismail khan pakistan reuters a suspected ...,100,2,53,53
4,U.S. special envoy encouraged that Kurds could...,"ERBIL (Reuters) - Brett McGurk, the U.S. speci...",worldnews,"September 14, 2017",0,u s special envoy encouraged that kurds could ...,erbil reuters brett mcgurk the u s special env...,96,21,62,62
...,...,...,...,...,...,...,...,...,...,...,...
35913,Turkey not in direct talks for return of intel...,ANKARA (Reuters) - A Turkish minister appeared...,worldnews,"September 19, 2017",0,turkey not in direct talks for return of intel...,ankara reuters a turkish minister appeared to ...,92,0,44,44
35914,Norway's Liberal Party seeks to join government,OSLO (Reuters) - Norway s Liberal Party will l...,worldnews,"December 9, 2017",0,norway s liberal party seeks to join government,oslo reuters norway s liberal party will launc...,93,5,57,57
35915,"Trump could lose a few pounds, but otherwise h...",(Reuters) - Donald Trump knows he could stand ...,politicsNews,"September 14, 2016",0,trump could lose a few pounds but otherwise he...,reuters donald trump knows he could stand to l...,93,4,64,64
35916,Syrian rebel groups reject Russian-sponsored S...,AMMAN (Reuters) - Syrian rebel groups on Monda...,worldnews,"December 25, 2017",0,syrian rebel groups reject russian sponsored s...,amman reuters syrian rebel groups on monday re...,84,1,74,74


In [18]:
df['n_caracteres']=df["text"].str.len()
df["n_dots"] = df["text"].map(lambda x: x.count('.'))

df["n_emojis"] = df["text"].map(emoji.emoji_count)
df["len_text_raw"] = df["text"].str.len()
df["n_words"] = df["text"].str.split(" ").str.len()
df["n_email"] = df["text"].map(lambda x: re.findall(r'[a-z0-9\._-]+@[a-z0-9\._-]+', x.lower())).str.len()

#df["n_urls"] = df["text"].map(lambda x:re.findall("http://", df["text"]) + re.findall("https://", df["text"]),axis=1).str.len()
df["n_lower_ratio_len"] = df["text"].map(lambda x:sum(map(str.islower, x))) / df["n_caracteres"]
df["n_upper_ratio_len"] = df["text"].map(lambda x:sum(map(str.isupper, x))) / df["n_caracteres"]
df["n_letters"] = df["text"].map(lambda x:sum(map(str.isalpha, x)))
df["n_lower_ratio_letters"] = df["text"].map(lambda x:sum(map(str.islower, x))) / df["n_letters"]
df["n_upper_ratio_letters"] = df["text"].map(lambda x:sum(map(str.isupper, x))) / df["n_letters"]
df["n_emojis"] = df["text"].map(emoji.emoji_count)
df["n_phone"] = df["text"].map(lambda x:re.findall(r'\+?(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})', x)).str.len()

#My own variables
df["is_question"] = df["text"].str.contains('?', regex=False).astype(int)
df['n_images'] = df["text"].str.contains("imagen omitid", case=False, na=False).astype(int)
df["n_audios"] = df["text"].str.contains("audio omitido", case=False, na=False).astype(int)
df["n_stickers"] = df.apply(lambda x:re.findall('esticker omitido', x['text']), axis = 1).str.len()

dict = {'hola': 1,'gracias':1, 'buenos dias': 1, 'buenas tardes': 1, 'buenas noches': 1}
dict_g = {'wey': 1,'pinche':1, 'cabron': 1, 'puto': 1, 'buenas noches': 1}
dict_ja = {'jaja': 1,'jajaja':1, 'jajaj': 1, 'jjajaja': 1, 'jaj': 1}
dict_no = {'no': 1,'nanais':1, 'nel': 1, 'niwas': 1, 'non': 1}
dict_yes = {'si': 1,'sisi':1, 'que bien!': 1, 'ahuevo': 1, 'felicidades': 1}

'''
X_train['is_polite'] = X_train['message'].str.extract('('+'|'.join(dict).lower()+')',expand=False).map(dict)
X_train['is_polite'] = X_train['is_polite'].fillna(0).astype(int) 
X_train['is_rude'] = X_train['message'].str.extract('('+'|'.join(dict_g).lower()+')',expand=False).map(dict_g)
X_train['is_rude'] = X_train['is_rude'].fillna(0).astype(int) 
X_train['is_hilarious'] = X_train['message'].str.extract('('+'|'.join(dict_ja).lower()+')',expand=False).map(dict_ja)
X_train['is_hilarious'] = X_train['is_hilarious'].fillna(0).astype(int) 
X_train['is_negative'] = X_train['message'].str.extract('('+'|'.join(dict_no).lower()+')',expand=False).map(dict_no)
X_train['is_negative'] = X_train['is_negative'].fillna(0).astype(int) 
X_train['is_positive'] = X_train['message'].str.extract('('+'|'.join(dict_yes).lower()+')',expand=False).map(dict_yes)
X_train['is_positive'] = X_train['is_positive'].fillna(0).astype(int) 
'''

"\nX_train['is_polite'] = X_train['message'].str.extract('('+'|'.join(dict).lower()+')',expand=False).map(dict)\nX_train['is_polite'] = X_train['is_polite'].fillna(0).astype(int) \nX_train['is_rude'] = X_train['message'].str.extract('('+'|'.join(dict_g).lower()+')',expand=False).map(dict_g)\nX_train['is_rude'] = X_train['is_rude'].fillna(0).astype(int) \nX_train['is_hilarious'] = X_train['message'].str.extract('('+'|'.join(dict_ja).lower()+')',expand=False).map(dict_ja)\nX_train['is_hilarious'] = X_train['is_hilarious'].fillna(0).astype(int) \nX_train['is_negative'] = X_train['message'].str.extract('('+'|'.join(dict_no).lower()+')',expand=False).map(dict_no)\nX_train['is_negative'] = X_train['is_negative'].fillna(0).astype(int) \nX_train['is_positive'] = X_train['message'].str.extract('('+'|'.join(dict_yes).lower()+')',expand=False).map(dict_yes)\nX_train['is_positive'] = X_train['is_positive'].fillna(0).astype(int) \n"

In [25]:
X

Unnamed: 0,title,text,subject,date,n_emojis,len_text_raw,n_words,n_email,is_question,n_images,n_audios,n_stickers
0,Barca's Valverde skirts independence debate as...,"MADRID, (Reuters) - Barcelona boss Ernesto Val...",worldnews,"October 27, 2017",0,1993,377,0,0,0,0,0
1,Trump says Brexit wouldn't impact potential UK...,LONDON (Reuters) - Presumptive U.S. Republican...,politicsNews,"May 15, 2016",0,744,136,0,0,0,0,0
2,Mark Zuckerberg’s PERFECT Facebook Comment To...,There are seldom people in life that you can h...,News,"January 4, 2016",0,2343,378,0,0,0,0,0
3,Suspected U.S. drone targets Haqqani militants...,"DERA ISMAIL KHAN, Pakistan (Reuters) - A suspe...",worldnews,"November 30, 2017",0,1806,291,0,0,0,0,0
4,U.S. special envoy encouraged that Kurds could...,"ERBIL (Reuters) - Brett McGurk, the U.S. speci...",worldnews,"September 14, 2017",0,432,76,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
35913,Turkey not in direct talks for return of intel...,ANKARA (Reuters) - A Turkish minister appeared...,worldnews,"September 19, 2017",0,1322,222,0,0,0,0,0
35914,Norway's Liberal Party seeks to join government,OSLO (Reuters) - Norway s Liberal Party will l...,worldnews,"December 9, 2017",0,1115,183,0,0,0,0,0
35915,"Trump could lose a few pounds, but otherwise h...",(Reuters) - Donald Trump knows he could stand ...,politicsNews,"September 14, 2016",0,1888,321,0,1,0,0,0
35916,Syrian rebel groups reject Russian-sponsored S...,AMMAN (Reuters) - Syrian rebel groups on Monda...,worldnews,"December 25, 2017",0,1706,288,0,0,0,0,0


In [None]:
from sklearn. preprocessing import LabelEncoder
enc = LabelEncoder()
label_encoder = enc.fit(data_train.iloc[:,1])
print("Clases categóricas: " label_encoder.classes_)
integer_classes = label_encoder.transform(label_encoder.classes_)
print("Clases numéricas: " ‚integer_classes)
t = label_encoder.transform(data_train.iloc[:‚1])
data_train.iloc[:,1] = t
data_train.head()

In [19]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
label_encoder = enc.fit(X.iloc[:,1])
print("Clases categóricas: " , label_encoder.classes_)

Clases categóricas:  [' ' '  '
 '    (Welcome) to America We hope you enjoy our benefits. Please feel free to NOT assimilate  Coming soon: Press 1 for English, 2 for Spanish and 3 for Arabic The Obama Administration is on pace to issue more than a million green cards to migrants from majority-Muslim countries, according to an analysis of Department of Homeland Security data.A chart released by the Senate Subcommittee on Immigration and the National Interest Friday details the surge in immigration to the U.S. from majority-Muslim countries since President Barack Obama took office in 2009.Specifically, in the first six fiscal years of Obama s presidency (FY2009   FY2014), his administration issued 832,014 green cards to migrants majority-Muslim countries, the most of which were issued to migrants from Pakistan (102,000), Iraq (102,000), Bangladesh (90,000), Iran (85,000), Egypt (56,000), and Somalia (37,000).The total 832,014 new permanent residents do not include migrants on temporary, 

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

print('X_train:', X_train.shape)
print('X_test:', X_test.shape)


X_train: (28734, 4)
X_test: (7184, 4)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

logits = LogisticRegression()

logits.fit(X_train, y_train)

ValueError: could not convert string to float: "Iran's Khamenei says Moscow, Tehran cooperation can isolate U.S.: TV"

In [None]:
y_pred = logits.predict(X_test )

In [None]:
predicts=\
pd.concat([pd.Series(y_test).reset_index(drop=True),pd.Series(y_pred)], axis=1).reset_index().drop(columns='index').rename(columns={'is_canceled':'y_test',0:'y_pred'})

In [None]:
predicts.sample(10)

In [None]:
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
 print(classification_report(y_test, y_pred))

In [None]:
#Logit 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

logits = LogisticRegression()

logits.fit(X_train, y_train)

y_pred = logits.predict(X_test )

 print(classification_report(y_test, y_pred))

In [None]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(random_state=123)
dtc.fit(X_train, y_train)

y_pred_dtc = dtc.predict(X_test )

print(classification_report(y_test, y_pred_dtc))

print(f'AUC Score: {roc_auc_score(y_test, y_pred)}')
print(f'AUC Score train: {roc_auc_score(y_score=logits.predict_proba(X_train)[:,1],y_true=y_train )}')
print(f'AUC Score test: {roc_auc_score(y_score=logits.predict_proba(X_test)[:,1],y_true=y_test )}')


In [None]:
#Random Forest 
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=123)

rfc.fit(X_train, y_train)

y_pred_rfc = rfc.predict(X_test )

print(classification_report(y_test, y_pred_rfc))

In [None]:
#KGBoost 
import xgboost as xgb

xgboost = xgb.XGBClassifier(random_state=123)
xgboost.fit(X_train, y_train)


y_pred_xgb = xgboost.predict(X_test)

print(classification_report(y_test, y_pred_xgb))