Tqdm : Progress bar

##### Setup #####

In [2]:
#Imports
!pip install gensim
import numpy as np
import sklearn
import sklearn.linear_model
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.models import Word2Vec

import gensim.downloader as api

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



Load CSV file, leaving only the collumns we might be interested in.

In [3]:
def getParts():
    dropList = ['domain', 'url', 'scraped_at', 'updated_at', 'title', 'authors', 'keywords', 'meta_keywords', 'tags', 'summary' ] # 'source'

    df = pd.read_csv('csvFile.csv', nrows=100000)
    
    # Filter fake
    df_fake = df.loc[df['type'] == 'fake']
    df_conspiracy = df.loc[df['type'] == 'conspiracy']


    #filter reliable
    df_reliable = df.loc[df['type'] == 'reliable']
    df_political = df.loc[df['type'] == 'political']

    #Concat
    df_reliable = pd.concat([df_political, df_reliable], ignore_index=True)
    df_filtered = pd.concat([df_fake, df_reliable], ignore_index=True)

    # Write DataFrame to CSV file
    print("duplicates: ", df_filtered.duplicated(subset=['content']).sum())
    df_filtered = df_filtered.drop(dropList, axis=1)
    df_filtered.drop_duplicates(subset=['content'], inplace=True)
    df_filtered.to_csv('readyData.csv', index=False)
    
    print("Fake / reliable")
    print(df_fake.index)
    print(df_reliable.index)
    print("")
    print("loaded index")
    print(df.index)
    print("")
    print("filtered")
    print(df_filtered)
    print("")
    print("Creates a csv file called: readyData.csv")
getParts()

duplicates:  8498
Fake / reliable
Int64Index([   27,    28,    29,    30,    31,    32,    33,    34,    58,
               71,
            ...
            89296, 89297, 89298, 89299, 89300, 89301, 89304, 89307, 94742,
            99090],
           dtype='int64', length=45768)
RangeIndex(start=0, stop=27660, step=1)

loaded index
RangeIndex(start=0, stop=100000, step=1)

filtered
       Unnamed: 0      id      type  \
0              27      34      fake   
1              28      35      fake   
2              29      36      fake   
3              30      37      fake   
4              31      38      fake   
...           ...     ...       ...   
73421        7790   96593  reliable   
73423        1189  101029  reliable   
73424        4169  105092  reliable   
73425        4440  105498  reliable   
73427        9582  112137  reliable   

                                                 content  \
0      Headline: Bitcoin & Blockchain Searches Exceed...   
1      Water Cooler 1/25/18

##### Data overview #####

In [4]:
data01 = pd.read_csv('readyData.csv')
def types(inp):
    results = inp
    results = results.dtypes
    return results

print("Types:")
print(types(data01))

print("")
print ('Count of Null: ' )
data01.isnull().sum()

Types:
Unnamed: 0            int64
id                    int64
type                 object
content              object
inserted_at          object
meta_description     object
source              float64
dtype: object

Count of Null: 


Unnamed: 0              0
id                      0
type                    0
content                 0
inserted_at             0
meta_description    50357
source              64930
dtype: int64

##### Preproccesing #####

In [6]:
##### -- Imports -- #####
import re
import csv
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#Remember to run the line below the first time 
# nltk.download('punkt') 

fileRaw = 'readyData.csv'


''' Creates dataframe (Run fuctions) '''
def createDataframe(input): 
    df = pd.read_csv(input)
    df = cleanContent(df, 'content')
    return df

''' Cleans and tokenizes text  '''
sno = nltk.stem.SnowballStemmer('english')
def cleanContent(input, columnName):
    input[columnName] = input[columnName].str.lower()
    regexList = ['\.', ':', '&', ',', '\?', ' us ', '!', ';', '\$', '%', '\(', '\)', '\[', '\]']
    stop_words = set(stopwords.words('english'))
    regexList += [r'\b{}\b'.format(word) for word in stop_words]
    pattern = re.compile('|'.join(regexList))
    input[columnName] = input[columnName].apply(lambda x: pattern.sub('', x))
    input[columnName] = input[columnName].str.replace(r'\b\d+-\d+-\d+-\d+\b', 'phone', regex=True)
    input[columnName] = input[columnName].str.replace(r'https?://\S+|\bhttp://\S+', 'url', regex=True)
    input[columnName] = input[columnName].str.replace(r'\b\d+\b', 'number', regex=True)
    input[columnName] = input[columnName].str.replace(r"\b\w\b\s?\b", '', regex=True)
    input[columnName] = input[columnName].str.replace(r"['`.*@-]", '', regex=True)
    input[columnName] = input[columnName].str.replace(r'\s+', ' ', regex=True)
    for i in range(0, len(input[columnName])):
        # print (input.at[i, columnName]) 
        colElm = input.at[i, columnName]
        colElm = nltk.word_tokenize(colElm)
        stemmed_words = []
        for word in colElm:
            stemmed_words.append(sno.stem(word))
        stemmed_words = ' '.join(stemmed_words)
        input.at[i, columnName] = stemmed_words
    return input

'''Converts to csv File'''
def run(inp):
    inp = inp.to_csv('cleanedNews.csv', index = True)

##### -- Calls -- #####
run(createDataframe(fileRaw))

##### Content overview #####

In [9]:
##### -- Imports -- #####
import matplotlib.pyplot as plt 
import itertools

##### --  Global variables -- #####
fileCleaned = 'cleanedNews.csv'



# dfClean = pd.read_csv(nameOfCleanedCSV)
# dfRaw = pd.read_csv(nameOfRawCSV)

##### --  Functions -- #####
''' Creates dictionary of 100 most used times'''
def wordDic(input):
    file = open(input, 'r',  errors="surrogateescape")
    read = file.read().lower()
    words = read.split()  
    dictionary = {}
    for i in words:
        if i in dictionary:
            dictionary[i] += 1  
        else:
            dictionary[i] = 1
    sort = dict(sorted(dictionary.items(), key=lambda x: x[1], reverse=True))
    words = dict(itertools.islice(sort.items(), 100))
    return words

def something(input):
    resultList = list(wordDic(input).items())
    lst = []
    for elm in resultList:
        lst.append(elm[0])
    return lst


##### --  Prints -- #####
print("Most used words in clean and raw")
mostUsedWordsClean = (something(fileCleaned))
mostUsedWordsRaw = (something(fileRaw))
print(mostUsedWordsClean)
print(mostUsedWordsRaw) 

print("\nWords that apear in both list of words:")
doubleAppearances = []
for word in mostUsedWordsClean:
    if word in mostUsedWordsRaw:
        doubleAppearances.append(word)
print(doubleAppearances)

Most used words in clean and raw
['number', '’', '“', '”', 'one', 'trump', 'state', 'would', 'peopl', 'time', 'blockchain', 'year', 'like', 'new', 'said', 'use', 'also', '–', 'report', 'make', 'next', 'get', 'market', 'two', 'go', '01:19:41.756632,,', 'even', 'think', 'stock', 'say', 'first', 'work', '``', 'day', 'presid', 'mani', 'govern', 'search', 'world', 'bitcoin', 'american', 'right', 'take', 'could', 'need', 'come', '—', 'way', 'exceed', 'know', 'nation', 'sourc', 'see', 'may', 'headlin', 'call', 'fact', "''", 'want', 'the', 'includ', 'stori', 'news', 'last', 'thing', 'well', 'countri', 'much', 'support', 'look', 'back', 'obama', 'hous', 'law', 'live', 'democrat', 'good', 'help', 'polit', 'public', '‘', 'show', 'power', 'week', 'republican', 'unit', 'made', 'war', 'chang', 'life', 'part', 'group', 'follow', 'inform', 'read', 'end', 'post', 'system', 'america', 'point']
['the', 'of', 'to', 'and', 'a', 'in', 'is', 'that', 'for', 'on', 'are', 'with', 'as', 'it', 'this', 'be', 'by',

The above code block solves the following tasks:
 - Cheks the effect of the cleaning, based on the words apearing in the most used list.

In [11]:
def allWordsToDic(input):
    file = open(input, 'r',  errors="surrogateescape")
    read = file.read().lower()
    words = read.split()  
    dictionary = {}
    for i in words:
        if i in dictionary:
            dictionary[i] += 1  
        else:
            dictionary[i] = 1
    return dictionary

print("the word number apears: " + str(allWordsToDic(fileCleaned)['number']) + ' times')
print("the word phone apears: " + str(allWordsToDic(fileCleaned)['phone']) + ' times')
print("the word url apears: " + str(allWordsToDic(fileCleaned)['url']) + ' times')

the word number apears: 587958 times
the word phone apears: 6999 times
the word url apears: 160 times


##### Ready data for baseline model #####

In [15]:
# Split data
df = pd.read_csv(('cleanedNews.csv'))
 
# Creating a dataframe with 80% of the data
part_80 = df.sample(frac = 0.8)
part_80.to_csv('split80_Train.csv', index=False)

# Creating a dataframe with the rest (20%)
rest_part = df.drop(part_80.index) 
rest_part.to_csv('split20_temp.csv', index=False)

# Creating a new dataframe to split the 20 % for test and validation
df10 = pd.read_csv('split20_temp.csv')

# Creating a dataframe with 50% of the data / 10% of the whole dataset 
part_50 = df10.sample(frac=0.5)
part_50.to_csv('split10_test.csv', index=False)

# Creating a dataframe with 50% of the data / 10% of the whole dataset 
part_50_2 = df10.drop(part_50.index)
part_50_2.to_csv('split10_val.csv', index=False)

print("\n80% of DataFrame:")
print(part_80.shape)

print("\n10% of DataFrame:")
print(part_50.shape)

print("\nrest of the 10% of DataFrame:")
print(part_50_2.shape)


80% of DataFrame:
(51944, 8)

10% of DataFrame:
(6493, 8)

rest of the 10% of DataFrame:
(6493, 8)
