# ***Loading the data into data frames***

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
#from gensim.models import Word2Vec
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


# Fetch the NLTK resources this notebook relies on (tokenizer models and the stop-word lists).
nltk.download('punkt')
nltk.download('stopwords')
stopWords = set(stopwords.words('english'))  # set -> O(1) membership checks during filtering
le = preprocessing.LabelEncoder()

true_df = pd.read_csv('./True.csv')
fake_df = pd.read_csv('./Fake.csv')

# Target label: 0 for rows loaded from True.csv, 1 for rows loaded from Fake.csv.
true_df['fake'] = 0
fake_df['fake'] = 1
data = pd.concat([true_df, fake_df])
# Shuffle the stacked frames. The seed is fixed so the row order is the same on every run;
# without it, later seeded steps would still see differently ordered data each time.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Preview the shuffled, labelled frame (at most the first 30 rows).
data.head(30)

Unnamed: 0,title,text,subject,date,fake
0,Anti-Trump Protester’s X-Rated Comment Angers ...,This is rich! A live feed from MSNBC was tryin...,left-news,"May 28, 2016",1
1,"Tillerson, on first day, addresses dissent and...",WASHINGTON (Reuters) - U.S. Secretary of State...,politicsNews,"February 2, 2017",0
2,New Mexico Is Erasing Actual Science From The...,"Education standards, it seems, is now a term...",News,"September 29, 2017",1
3,China warns against attempts to contain Beijin...,WASHINGTON (Reuters) - China s ambassador to W...,worldnews,"October 30, 2017",0
4,U.S. to issue rules to tighten individual trav...,WASHINGTON (Reuters) - The Trump administratio...,politicsNews,"June 16, 2017",0
5,Congressman seeks probe of chartered flights b...,NEW YORK/WASHINGTON (Reuters) - A Democratic U...,politicsNews,"October 12, 2017",0
6,FORMER DEMOCRAT WARNS Young Americans: “Rioter...,"Who is silencing political speech, physically...",politics,"Mar 10, 2017",1
7,Car bomber kills at least 15 in Yemeni port Ad...,ADEN (Reuters) - A suicide car bomber blew him...,worldnews,"November 5, 2017",0
8,U.S. Interior Secretary investigated over spee...,WASHINGTON (Reuters) - The U.S. Office of Spec...,politicsNews,"October 3, 2017",0
9,Clinton: Senate should consider Obama's high c...,WASHINGTON (Reuters) - Democratic presidential...,politicsNews,"March 16, 2016",0


In [None]:
# Work on an independent copy: the pre-processing cells overwrite `title`/`text`/`subject`
# in place, and a plain `df = data` would make those writes show up in `data` as well.
# With a copy, re-running this cell restores a clean starting point.
df = data.copy()

# ***Data Pre-processing***

Steps for pre-processing:


1.   Breaking the sentences into lowercase words using NLTK's `word_tokenize`
2.   Stop words removal
3.   Applying the count vectorizer on the data to get the count of each word in a sentence
4.   Encoding the data using the TF-IDF technique (inverse frequency encoding)



In [None]:
def removePunc(stringList):
  """Keep only the purely alphanumeric tokens of ``stringList``, lower-cased.

  A token is dropped as a whole whenever ``str.isalnum()`` is false for it. That removes
  bare punctuation, but also mixed tokens such as ``"u.s."`` or ``"anti-trump"`` and the
  empty string.

  Args:
    stringList: iterable of string tokens.

  Returns:
    A new list with the surviving tokens in their original order, each lower-cased.
  """
  kept = []
  for token in stringList:
    if token.isalnum():
      kept.append(token.lower())
  return kept

def removeStopWords(stringList):
  """Return the tokens of ``stringList`` that are not in the module-level ``stopWords`` set.

  The comparison is an exact membership test, so no case folding happens here.

  Args:
    stringList: iterable of string tokens.

  Returns:
    A new list with the non-stop-word tokens in their original order.
  """
  return list(filter(lambda token: token not in stopWords, stringList))

def lowerSrtings(stringList):
  """Return a new list holding the lower-cased form of every token in ``stringList``.

  Args:
    stringList: iterable of string tokens.

  Returns:
    A list of the same length and order with ``.lower()`` applied to each element.
  """
  lowered = []
  for token in stringList:
    lowered.append(token.lower())
  return lowered

# Disabled Word2Vec helpers: they live inside a bare string literal, so nothing here is
# ever defined or executed. Being the last expression of the cell, the string itself is
# what the cell displays.
'''def encodeData(stringList):
  return [w2vModel[e] for e in stringList]

def singleVector(vectorList):
  return [np.mean(np.array(e)) for e in vectorList]'''

'def encodeData(stringList):\n  return [w2vModel[e] for e in stringList]\n\ndef singleVector(vectorList):\n  return [np.mean(np.array(e)) for e in vectorList]'

In [None]:
def _clean_tokens(raw):
  """Turn one raw string into its cleaned token list.

  Steps, in order: NLTK word tokenization, lower-casing, stop-word removal, and dropping
  every token that is not purely alphanumeric.
  """
  return removePunc(removeStopWords(lowerSrtings(nltk.word_tokenize(raw))))


# Clean both free-text columns. Series.apply hands each cell value straight to the helper;
# the earlier row-wise DataFrame.apply(axis=1) built a full row object for every article on
# each of its eight passes just to read a single column.
for col in ('title', 'text'):
  df[col] = df[col].apply(_clean_tokens)

# NOTE: a Word2Vec-based encoding was tried here previously and is disabled (the gensim
# import at the top of the notebook is commented out as well).

# Encode the categorical `subject` column as integer codes. `dct` maps each subject label
# to its position in the encoder's class list.
le.fit(list(df['subject']))
dct = {label: code for code, label in enumerate(le.classes_)}

df['subject'] = df['subject'].map(dct)

In [None]:
# The publication date is not used as a feature. errors='ignore' keeps this cell safe to
# re-run: once the column is gone, a second run is a no-op instead of a KeyError.
df = df.drop(columns='date', errors='ignore')

In [None]:
# NOTE: plain assignment — `df1` is a second name for the same DataFrame object, not a
# copy, so in-place changes made through either name are visible through both.
df1 = df

In [None]:
# Inspect the first rows after cleaning: token lists in title/text, integer subject codes.
df.head()

Unnamed: 0,title,text,subject,fake
0,"[protester, comment, angers, msnbc, anchor, gr...","[rich, live, feed, msnbc, trying, show, great,...",4,1
1,"[tillerson, first, day, addresses, dissent, ca...","[washington, reuters, secretary, state, rex, t...",6,0
2,"[new, mexico, erasing, actual, science, scienc...","[education, standards, seems, term, fraught, i...",2,1
3,"[china, warns, attempts, contain, beijing, tru...","[washington, reuters, china, ambassador, washi...",7,0
4,"[issue, rules, tighten, individual, travel, re...","[washington, reuters, trump, administration, p...",6,0


In [None]:
# Glue every token list back into one space-separated string per article; these strings
# are the input documents for the TF-IDF vectorizers.
l = df['text'].tolist()
texts = list(map(' '.join, l))

ll = df['title'].tolist()
titles = list(map(' '.join, ll))

In [None]:
# TF-IDF matrix for the titles. min_df=4 leaves out terms that occur in fewer than
# 4 titles.
vectorizer_title = TfidfVectorizer(min_df=4)
X_title = vectorizer_title.fit_transform(titles)

In [None]:
# Reduce the title TF-IDF matrix to 2 components per article with truncated SVD.
# random_state is fixed so the decomposition is repeatable for the same input.
svd_n = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
X_title_reduced = svd_n.fit_transform(X_title)

In [None]:
# TF-IDF matrix for the article bodies, with the same min_df=4 cut-off as the titles
# (terms found in fewer than 4 documents are left out).
vectorizer = TfidfVectorizer(min_df=4)
X_text = vectorizer.fit_transform(texts)

In [None]:
# Reduce the body TF-IDF matrix to 5 components per article with truncated SVD.
# random_state is fixed so the decomposition is repeatable for the same input.
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_text_reduced = svd.fit_transform(X_text)

In [None]:
# Pull the encoded subject feature and the target label out of the frame as standalone
# NumPy arrays (explicit copies, detached from the DataFrame).
X_subject = df['subject'].to_numpy(copy=True)
Y = df['fake'].to_numpy(copy=True)

In [None]:
# Build one feature row per article: the title SVD components, then the text SVD
# components, then the integer subject code as the last column.
# A single column_stack call replaces the per-row np.append loop, which allocated two
# fresh arrays for every article. The three inputs must have the same number of rows.
# The result is kept as a list of 1-D row arrays, the same type the loop produced.
X_transformed = list(np.column_stack((X_title_reduced, X_text_reduced, X_subject)))

In [None]:
# Convert the collected feature rows into a single NumPy array (one row per article).
# NOTE: this rebinds the same name to a different type.
X_transformed = np.array(X_transformed)

In [None]:
# Sanity check on the first feature row: 2 title components + 5 text components
# + 1 subject code = 8 values.
X_transformed[0]

array([ 1.75265198e-02, -1.47647539e-02,  3.51626418e-02, -2.38295631e-02,
       -2.56282879e-03, -3.81993574e-02, -3.33043786e-02,  4.00000000e+00])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes which rows land in each part
# for a given row order of X_transformed / Y.
X_train, X_test, y_train, y_test = train_test_split(X_transformed, Y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Shallow random forest (each tree limited to depth 3). random_state is fixed because
# forest training is randomized; without a seed every run yields a different model.
# 42 matches the seed used for the SVD and train/test split steps.
clf = RandomForestClassifier(max_depth=3, random_state=42)

In [None]:
# Train on the training split. scikit-learn's fit() returns the estimator itself, so
# `model` and `clf` refer to the same fitted object.
model = clf.fit(X_train, y_train)