In [1]:
import numpy as np
import pandas as pd

import os
import datetime
import re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


## Reading and preprocessing the article data from the GAStech Challenge

In [2]:
article_directory = 'data/articles/'

dates = []
contents = []
headers = []
mediums = []


def _parse_article_date(content):
    """Find and parse the first date string in an article.

    Three notations are tried in order: numeric with slashes
    (year/month/day, falling back to month/day/year), "day Month year",
    and "dayMonth year" (no space between day and month name).

    Returns:
        (date, date_text): the parsed ``datetime.datetime`` and the exact
        substring of ``content`` it was parsed from.

    Raises:
        ValueError: if no candidate is found, or the first candidate does
        not match the expected format.
    """
    matches = re.findall(r'(\d+/\d+/\d+)', content)
    if matches:
        try:
            return datetime.datetime.strptime(matches[0], '%Y/%m/%d'), matches[0]
        except ValueError:
            # %m and %d already accept one-digit values, so no zero padding is
            # needed; prepending '0' would break two-digit months such as 12.
            return datetime.datetime.strptime(matches[0], '%m/%d/%Y'), matches[0]

    matches = re.findall(r'(\d+ \w+ \d+)', content)
    if matches:
        return datetime.datetime.strptime(matches[0], '%d %B %Y'), matches[0]

    matches = re.findall(r'(\d+\w+ \d+)', content)
    if matches:
        return datetime.datetime.strptime(matches[0], '%d%B %Y'), matches[0]

    raise ValueError('no recognisable date found')


# find dates, headers and main contents from article data

for file in os.listdir(article_directory):
    # the context manager closes the handle even if reading fails
    with open(os.path.join(article_directory, file), 'r') as file_o:
        content = file_o.read()
    content = content.replace('  ', ' ')
    # the first line of every article names the publishing medium
    medium = content.split('\n')[0]

    try:
        date, date_text = _parse_article_date(content)
    except ValueError as err:
        # name the offending file so a malformed article is easy to locate
        raise ValueError('{}: {}'.format(file, err)) from err

    # split and format generated text content into dates, headers, contents and mediums
    split_content = content.split(date_text)
    dates.append(date)
    # header: second line of the text before the date, after collapsing blank lines
    headers.append(split_content[0].replace('\n\n', '\n').replace('\n\n', '\n').replace('\n \n', '\n').split('\n')[1])
    # body: everything after the last occurrence of the date string, on a single line
    contents.append(split_content[-1].replace('\n', ' '))
    # medium: remove all spaces, then insert one before a capital that follows a word character
    mediums.append(re.sub(r"(\w)([A-Z])", r"\1 \2", medium.replace(' ', '')))
    
    
    


In [3]:
# assemble the parsed article fields into one dataframe (one row per article)
article_columns = {
    'Date': dates,
    'Header': headers,
    'Content': contents,
    'Medium': mediums,
}
df = pd.DataFrame(article_columns)

In [4]:
# preview the first rows of the article dataframe to sanity-check the parsed columns
df.head()

Unnamed: 0,Date,Header,Content,Medium
0,2012-09-08,BUMP OF PROTESTS IN ABILA IN RESPONSE TO THE C...,"ABILA, Kronos - the thousands of people s...",The Orb
1,2013-12-18,ENORMOUS IPO MAKES THE BILLIONAIRE OF SANJORGE,"CENTRUM, Tethys - the president and PRESIDE...",The Lightof Truth
2,2014-01-20,VOICES - a blog about what is important to the...,1018 - A fire alarm has gone off at GAStech ...,Homeland Illumination
3,2007-03-19,Four people have died in an enthusiastic disch...,Of nine years - the old boy initially survi...,The Continent
4,2012-11-11,THE DEMONSTRATION ATTRACTS THOUSANDS IN SPITE ...,"ABILA, Kronos - a heavy rain did not stop t...",Daily Pegasus


## Read and preprocess Email Correspondence data

In [5]:
# Make name formatting in email data consistent
# Make date format consistent with article data

def _address_to_name(addresses):
    """Reduce a Series of e-mail addresses to compact names.

    Keeps the part before '@' and removes every '.' and ' ' from it,
    e.g. 'Sven.Flecha@gastech.com.kronos' -> 'SvenFlecha'.
    """
    local_part = addresses.str.split('@').str[0]
    return local_part.str.replace('.', '', regex=False).str.replace(' ', '', regex=False)


df_email = pd.read_csv('data/email headers.csv', sep=",", encoding='cp1252')

# Split the recipient list once and derive both the group flag and the
# exploded rows from it, so the two can never disagree about the separator.
recipients = df_email['To'].str.split(',')
df_email['Group'] = recipients.str.len() > 1  # True when the mail has several recipients
df_email['from'] = _address_to_name(df_email['From'])
df_email['To'] = recipients
# one row per (mail, recipient) pair
df_email = df_email.explode('To').reset_index(drop=True)
# drop the whitespace that followed each comma in the original list
df_email['To'] = df_email['To'].str.strip()
df_email['to'] = _address_to_name(df_email['To'])

TIME_FORMAT = '%m/%d/%Y %H:%M'
df_email['Date']= df_email['Date'].apply(lambda x: datetime.datetime.strptime(x, TIME_FORMAT))


In [6]:
# match names in email data to their departments from the employee records

def _compact(label):
    """Return ``label`` with every space and full stop removed."""
    return label.replace(' ', '').replace('.', '')


df_h = pd.read_excel("data/EmployeeRecords.xlsx")
df_h['FullName'] = df_h['FirstName'] + ' ' + df_h['LastName']

# edges from each department ('from') to each of its employees ('to')
df_hierarchy = (
    df_h[['FullName', 'CurrentEmploymentType']]
    .rename(columns={'FullName': 'to', 'CurrentEmploymentType': 'from'})
)

# edges from the synthetic 'Origin' label to every distinct department
deps = df_h['CurrentEmploymentType'].unique()
origins = ['Origin' for _ in deps]
df_hierarchy2 = pd.DataFrame({'from': origins, 'to': deps})

df_hierarchy = pd.concat((df_hierarchy2, df_hierarchy)).reset_index(drop=True)
# normalise both columns so they line up with the compact names used in the email data
for column in ('to', 'from'):
    df_hierarchy[column] = df_hierarchy[column].apply(_compact)

# look up each sender's parent label; a sender missing from the records raises KeyError
dep_dict = dict(zip(df_hierarchy['to'], df_hierarchy['from']))
df_email['Department'] = [dep_dict[sender] for sender in df_email['from']]


## Sentiment analysis using VADER

In [10]:
# article sentiment analysis: keep VADER's 'compound' score for every article body
analyzer = SentimentIntensityAnalyzer()
sentiment_scores = []
for article_text in df.Content:
    sentiment_scores.append(analyzer.polarity_scores(article_text)['compound'])
df['Sentiment Score'] = sentiment_scores

In [11]:
# email sentiment analysis: keep VADER's 'compound' score for every subject line
analyzer = SentimentIntensityAnalyzer()
email_sentiment_scores = list(
    map(lambda subject: analyzer.polarity_scores(subject)['compound'], df_email.Subject)
)
df_email['Sentiment Score'] = email_sentiment_scores

In [12]:
# save data
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (a no-op when it is already there)
os.makedirs('modified_data', exist_ok=True)
df.to_csv('modified_data/articles_preprocessed.csv')
df.head()

Unnamed: 0,Date,Header,Content,Medium,Cluster,Sentiment Score
0,2012-09-08,BUMP OF PROTESTS IN ABILA IN RESPONSE TO THE C...,"ABILA, Kronos - the thousands of people s...",The Orb,1,-0.9076
1,2013-12-18,ENORMOUS IPO MAKES THE BILLIONAIRE OF SANJORGE,"CENTRUM, Tethys - the president and PRESIDE...",The Lightof Truth,3,0.8187
2,2014-01-20,VOICES - a blog about what is important to the...,1018 - A fire alarm has gone off at GAStech ...,Homeland Illumination,4,-0.8828
3,2007-03-19,Four people have died in an enthusiastic disch...,Of nine years - the old boy initially survi...,The Continent,2,-0.9418
4,2012-11-11,THE DEMONSTRATION ATTRACTS THOUSANDS IN SPITE ...,"ABILA, Kronos - a heavy rain did not stop t...",Daily Pegasus,1,-0.7015


In [13]:
# save data
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (a no-op when it is already there)
os.makedirs('modified_data', exist_ok=True)
df_email.to_csv('modified_data/email_preprocessed.csv')
df_email.head()

Unnamed: 0,From,To,Date,Subject,Group,from,to,Department,Sentiment Score
0,Sven.Flecha@gastech.com.kronos,Isak.Baza@gastech.com.kronos,2014-01-06 08:39:00,GT-SeismicProcessorPro Bug Report,True,SvenFlecha,IsakBaza,InformationTechnology,0.0
1,Sven.Flecha@gastech.com.kronos,Lucas.Alcazar@gastech.com.kronos,2014-01-06 08:39:00,GT-SeismicProcessorPro Bug Report,True,SvenFlecha,LucasAlcazar,InformationTechnology,0.0
2,Kanon.Herrero@gastech.com.kronos,Felix.Resumir@gastech.com.kronos,2014-01-06 08:58:00,Inspection request for site,True,KanonHerrero,FelixResumir,Security,0.0
3,Kanon.Herrero@gastech.com.kronos,Hideki.Cocinaro@gastech.com.kronos,2014-01-06 08:58:00,Inspection request for site,True,KanonHerrero,HidekiCocinaro,Security,0.0
4,Kanon.Herrero@gastech.com.kronos,Inga.Ferro@gastech.com.kronos,2014-01-06 08:58:00,Inspection request for site,True,KanonHerrero,IngaFerro,Security,0.0
