In [None]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
from datetime import datetime
import time
import calendar
import random
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from textblob import Word
import shap
import IPython
# Fetch the NLTK data packages used for text processing. Each call reports
# "already up-to-date" when the package is present locally.
nltk.download('stopwords')
# NOTE(review): wordnet / omw-1.4 are presumably needed by textblob's
# Word.lemmatize() used further down -- confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Jack\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Reads the cleaned CSV dataset into `main_df`, the frame the following cells build on.
main_df=pd.read_csv("Group_17_data_cleaned.csv")
# Bare last expression: shows the available column names as the cell output.
main_df.columns

Index(['IDLink', 'Title', 'Headline', 'Source', 'Topic', 'PublishDate',
       'SentimentTitle', 'SentimentHeadline', 'Facebook', 'GooglePlus',
       'LinkedIn', 'PublishTime', 'Weekday', 'Facebook_scaled',
       'GooglePlus_scaled', 'LinkedIn_scaled', 'SentimentTitle_Category',
       'SentimentHeadline_Category'],
      dtype='object')

In [None]:
# Buckets every news outlet into one of four volume tiers (A, B, C, D) based on
# the number of articles it published, stores the tier in a new `Source_type`
# column and then drops the raw `Source` column.

# Articles published per outlet. The column is named explicitly because the
# default name produced by value_counts() differs between pandas versions
# ('Source' before 2.0, 'count' from 2.0 on).
source_df = main_df['Source'].value_counts().rename('Source').to_frame()

# Tier boundaries: quartiles (midpoint interpolation) of the *distinct*
# per-outlet article counts.
q1, q2, q3 = (
    pd.Series(source_df['Source'].unique())
    .quantile([0.25, 0.50, 0.75], interpolation='midpoint')
    .tolist()
)

# Look up each article's outlet volume once, then assign the tier with the
# first matching upper bound (A <= q1 < B <= q2 < C <= q3 < D).
articles_per_source = main_df['Source'].map(source_df['Source'])
main_df['Source_type'] = np.select(
    [articles_per_source <= q1, articles_per_source <= q2, articles_per_source <= q3],
    ['A', 'B', 'C'],
    default='D',
)
main_df = main_df.drop(columns=['Source'])

In [None]:
#Splits date and time into separate columns from published date column
def convert_to_dt(df):
    """Split ``PublishDate`` into separate date and time-of-day columns, in place.

    ``PublishDate`` is parsed with ``pd.to_datetime``; the time component is
    written to ``PublishTime`` and ``PublishDate`` is overwritten with the date
    component. Nothing is returned.
    """
    df['PublishDate'] = pd.to_datetime(df['PublishDate'])
    parsed = df['PublishDate']
    time_part, date_part = parsed.dt.time, parsed.dt.date
    df['PublishTime'] = time_part
    df['PublishDate'] = date_part

convert_to_dt(main_df)
#Converts the publish time into seconds since midnight
#Creates additional columns holding the sine and cosine of that time of day
#Also extracts the month name from the date and stores it as a separate column
def work_with_time_and_date(df):
    """Replace the publish time with a cyclical encoding and add the month name.

    Only ``.hour``, ``.minute`` and ``.second`` are read from the values in
    ``PublishTime`` and only ``.month`` from the values in ``PublishDate``.

    The frame is modified in place and also returned:
      * ``PublishSinTime`` / ``PublishCosTime`` -- sine and cosine of the time
        of day, expressed as a fraction of a 24-hour cycle.
      * ``PublishMonth`` -- month name looked up in ``calendar.month_name``.
      * ``PublishTime`` is dropped.
    """
    seconds_in_day = 24 * 60 * 60
    seconds_since_midnight = df['PublishTime'].apply(
        lambda t: (t.hour * 60 + t.minute) * 60 + t.second
    )

    # Compute the angle once and let NumPy evaluate sin/cos over the whole column.
    angle = 2 * np.pi * seconds_since_midnight / seconds_in_day
    df['PublishSinTime'] = np.sin(angle)
    df['PublishCosTime'] = np.cos(angle)
    df.drop(columns='PublishTime', inplace=True)

    # Use the frame that was passed in, not the module-level `main_df`, so the
    # function works for any frame and cannot silently mix two frames' rows.
    df['PublishMonth'] = df['PublishDate'].apply(lambda d: calendar.month_name[d.month])

    return df
main_df = work_with_time_and_date(main_df.copy())

In [None]:
# Loads one CSV per (platform, topic) pair into the `df` dictionary, keyed as
# '<platform>_<topic>', then adds 1 to every column except 'IDLink'.
sources = ['Facebook', 'GooglePlus', 'LinkedIn']
topics = ['Economy', 'Microsoft', 'Obama', 'Palestine']
folder_path = './'

df = {}
for platform in sources:
    for subject in topics:
        key = f'{platform}_{subject}'
        df[key] = pd.read_csv(f'{folder_path}{key}.csv')

# NOTE(review): the +1 shift presumably turns a -1 "no data" marker into 0 -- confirm.
for frame in df.values():
    value_columns = [name for name in frame if name != 'IDLink']
    for name in value_columns:
        frame[name] += 1


In [None]:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#Computes, for each window of n consecutive time slices, the change between the TS columns at the window's end and start
def change_df(df, n, col_name_prefix, horizon=72):
    """Turn the ``TS<k>`` columns into per-window differences.

    Parameters
    ----------
    df : DataFrame with an ``IDLink`` column and ``TS<k>`` columns for every
        multiple ``k`` of ``n`` up to ``horizon``.
    n : window width, in time slices.
    col_name_prefix : prefix of the generated columns (``<prefix>_t<i>``).
    horizon : last time slice to cover; defaults to 72, the value that was
        previously hard-coded.

    Returns
    -------
    DataFrame
        ``IDLink`` plus ``horizon // n`` columns, where column ``<prefix>_t<i>``
        holds ``TS<i*n> - TS<(i-1)*n>`` and ``TS0`` is taken to be 0.

    The input frame is left untouched: no helper ``TS0`` column is added to it.
    """
    # Explicit copy so the new columns are written to an independent frame
    # rather than to a slice of `df` (avoids SettingWithCopyWarning).
    new_df = df[['IDLink']].copy()
    for i, j in enumerate(range(n, horizon + 1, n), start=1):
        # The first window starts at the implicit TS0 == 0 baseline.
        window_start = df[f'TS{j - n}'] if j > n else 0
        new_df[f'{col_name_prefix}_t{i}'] = df[f'TS{j}'] - window_start

    return new_df
#Merges the Facebook, GooglePlus and LinkedIn dataframes into a single dataframe
def merge_df(fb_df, gp_df, li_df, topic):
    """Combine the three per-platform frames into one frame keyed by ``IDLink``.

    Each input is first passed through ``change_df`` with a window of 18 time
    slices and the prefixes 'fb', 'gp' and 'li' respectively; the results are
    then outer-joined on ``IDLink``. ``topic`` is accepted but not used.
    """
    window = 3*6
    fb_part, gp_part, li_part = [
        change_df(frame, window, prefix)
        for frame, prefix in ((fb_df, 'fb'), (gp_df, 'gp'), (li_df, 'li'))
    ]

    # Join GooglePlus with LinkedIn first, then attach the result to Facebook.
    gp_li_part = gp_part.merge(li_part, on = 'IDLink', how='outer')
    return fb_part.merge(gp_li_part, on = 'IDLink', how='outer')

#Merges the per-topic time-series columns into the main dataframe
def merge_all_df(main_df, df):
    """Attach the windowed platform time-series columns to the article frame.

    Parameters
    ----------
    main_df : DataFrame with at least ``IDLink``, ``Topic`` (lower-case topic
        names) and ``Facebook`` columns.
    df : dict mapping ``'<Platform>_<Topic>'`` to the time-series frame of that
        platform/topic, as consumed by ``merge_df``.

    Returns
    -------
    DataFrame
        The rows of the four known topics (economy, microsoft, obama,
        palestine, in that order), each left-joined on ``IDLink`` with its
        time-series columns. The result has a fresh 0..N-1 index and no
        missing values.
    """
    per_topic_frames = []
    for topic in ['Economy','Microsoft','Obama','Palestine']:
        time_df = merge_df(df[f'Facebook_{topic}'], df[f'GooglePlus_{topic}'], df[f'LinkedIn_{topic}'], topic )
        topic_rows = main_df[main_df.Topic == topic.lower()]
        per_topic_frames.append(topic_rows.merge(time_df, on='IDLink', how = 'left'))

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # ignore_index=True matters: every per-topic merge restarts its index at 0,
    # so keeping those labels would give rows of different topics the same
    # label and a label-based write would hit all of them.
    final_main_df = pd.concat(per_topic_frames, ignore_index=True)

    # Rows with any missing value whose final Facebook count is not 1 get
    # fb_t1 = 1; every remaining gap is filled with 0.
    has_missing = final_main_df.isnull().any(axis=1)
    needs_flag = has_missing & (final_main_df['Facebook'] != 1)
    final_main_df.loc[needs_flag, 'fb_t1'] = 1

    return final_main_df.fillna(0)

#returns list of columns to remove
def cols_to_remove(df):
    """Return the column labels of ``df`` that ``int()`` accepts.

    Labels are returned in column order. A label that ``int()`` rejects is
    skipped.
    """
    def _parses_as_int(label):
        # Catch only what int() raises for unparsable input, so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        try:
            int(label)
        except (TypeError, ValueError, OverflowError):
            return False
        return True

    return [col for col in df.columns if _parses_as_int(col)]

#Processes the title along with the headline using TfidfVectorizer and NLTK
def title_headline_processing():
    """Build a TF-IDF feature frame from the article titles and headlines.

    Operates on the module-level ``main_df``: adds a ``title_headline`` column
    holding "<Title>, <Headline>" in which every whitespace-separated word has
    been stemmed (``PorterStemmer``) and then lemmatized (textblob ``Word``),
    and fits a ``TfidfVectorizer`` on that column.

    Returns
    -------
    DataFrame
        One TF-IDF column per retained token plus ``IDLink``, in the same row
        order as ``main_df``. Token columns whose name parses as an integer
        are removed.
    """
    st = PorterStemmer()

    def _normalise(text):
        stemmed = " ".join([st.stem(word) for word in text.split()])
        return " ".join([Word(word).lemmatize() for word in stemmed.split()])

    main_df['title_headline'] = (main_df['Title']+', '+main_df['Headline']).apply(_normalise)

    tfidfvectorizer = TfidfVectorizer(analyzer='word', stop_words='english', max_df=0.7, min_df = 150, max_features = 1000)
    tfidf_wm = tfidfvectorizer.fit_transform(main_df['title_headline'])

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(tfidfvectorizer, 'get_feature_names_out'):
        tfidf_tokens = list(tfidfvectorizer.get_feature_names_out())
    else:
        tfidf_tokens = tfidfvectorizer.get_feature_names()

    df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(), columns=tfidf_tokens)
    # Positional alignment: row i of the TF-IDF matrix belongs to row i of main_df.
    df_tfidfvect['IDLink'] = main_df['IDLink'].reset_index(drop=True).copy()

    trash_cols = cols_to_remove(df_tfidfvect)
    return df_tfidfvect.drop(columns=trash_cols)

# Attach the time-series features, then the TF-IDF text features, to every article.
main_df = merge_all_df(main_df.copy(), df.copy())
tfidfvect_df = title_headline_processing()
main_df = main_df.merge(tfidfvect_df, on='IDLink', how = 'left')

# Removes columns that do not influence the outcome
irrelevant_columns = [
    'IDLink', 'Title', 'Headline', 'PublishDate', 'Facebook',
    'SentimentTitle', 'SentimentHeadline', 'GooglePlus',
    'LinkedIn', 'title_headline',
]
main_df = main_df.drop(columns=irrelevant_columns)

# Performs one-hot encoding of the categorical columns
categorical_columns = [
    'Topic', 'SentimentTitle_Category', 'SentimentHeadline_Category',
    'Source_type', 'PublishMonth', 'Weekday',
]
main_df = pd.get_dummies(main_df, columns=categorical_columns)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://p

In [None]:
# Saves the final dataset suitable for training; the row index is not written to the file.
main_df.to_csv("Final.csv",index=False)