In [1]:
import warnings
# Silences every warning category for the rest of the session, which includes
# pandas' SettingWithCopyWarning and deprecation notices.
# NOTE(review): consider narrowing this to specific categories so genuine
# problems in later cells stay visible.
warnings.filterwarnings('ignore')

In [2]:
# Visualization 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime as dt
from datetime import datetime
import seaborn as sns
plt.rcParams["figure.figsize"] = (16, 10)
plt.rcParams["xtick.labelsize"] = 10
plt.figure(figsize=(16,10)) # this creates a figure 16 inch wide, 10 inch high
from pprint import pprint
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Load Data

In [3]:
# Load the workbook and expose the column pandas labelled 'Unnamed: 0'
# under the name 'code'.
column_renames = {'Unnamed: 0': 'code'}
df = pd.read_excel('descript.xlsx').rename(columns=column_renames)

In [4]:
# (rows, columns) of the freshly loaded frame.
df.shape

(455727, 4)

In [5]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,code,TEXT,UTC DATE,Sent_type
0,1,"Another reason I failed in love is I'm poor,I ...",2018-07-31 23:13:47,Benefit
1,2,2019 Mercedes-Benz C 300 and AMG C 43 First Dr...,2018-07-31 23:30:00,Neutral
2,3,"Happy birthday to the Audi chasing maniac, hop...",2018-07-31 22:52:45,Benefit
3,4,Come see Mercedes Benz stadium in Atlanta...,2018-07-31 23:02:25,Neutral
4,5,Italy took Cristiano and ever since then they’...,2018-07-31 22:53:15,Neutral


### Datetime issue

In [6]:
# Parse the 'UTC DATE' column into datetimes and store them as 'date'.
parsed_utc = pd.to_datetime(df['UTC DATE'])
df['date'] = parsed_utc

In [7]:
# Check that the new 'date' column was parsed as a datetime dtype.
df.dtypes

code                  int64
TEXT                 object
UTC DATE             object
Sent_type            object
date         datetime64[ns]
dtype: object

In [8]:
# Keep only the calendar day. dt.normalize() sets the time-of-day to midnight
# while staying in datetime64, instead of building one Python `date` object per
# row (dt.date), which yields an object-dtype column that has to be converted
# back to datetime afterwards.
df['Date'] = df['date'].dt.normalize()

In [9]:
# The intermediate timestamp column is no longer needed once 'Date' exists.
df = df.drop(columns='date')

In [10]:
# Remove the original 'UTC DATE' source column; 'Date' replaces it.
df = df.drop(columns='UTC DATE')

In [11]:
# Preview the frame after swapping the timestamp columns for 'Date'.
df.head()

Unnamed: 0,code,TEXT,Sent_type,Date
0,1,"Another reason I failed in love is I'm poor,I ...",Benefit,2018-07-31
1,2,2019 Mercedes-Benz C 300 and AMG C 43 First Dr...,Neutral,2018-07-31
2,3,"Happy birthday to the Audi chasing maniac, hop...",Benefit,2018-07-31
3,4,Come see Mercedes Benz stadium in Atlanta...,Neutral,2018-07-31
4,5,Italy took Cristiano and ever since then they’...,Neutral,2018-07-31


In [12]:
# Make sure 'Date' is a datetime column so the .dt accessor works on it.
date_as_datetime = pd.to_datetime(df['Date'])
df['Date'] = date_as_datetime

In [13]:
# Check the dtype of 'Date' after conversion.
df.dtypes

code                  int64
TEXT                 object
Sent_type            object
Date         datetime64[ns]
dtype: object

### Month Problem fix

In [14]:
# Derive a monthly period (year-month) from each date.
monthly_period = df['Date'].dt.to_period('M')
df['month_time'] = monthly_period

In [15]:
# Check that 'month_time' has a monthly period dtype.
df.dtypes

code                   int64
TEXT                  object
Sent_type             object
Date          datetime64[ns]
month_time         period[M]
dtype: object

In [16]:
# Distinct months present in the data.
df.month_time.unique()

<PeriodArray>
['2018-07', '2018-12', '2018-10', '2016-11', '2016-02', '2016-04', '2017-07',
 '2016-12', '2017-01', '2017-05', '2017-11', '2016-09', '2017-09', '2016-05',
 '2017-03', '2017-02', '2016-03', '2016-07', '2018-04', '2017-12', '2018-09',
 '2017-08', '2018-08', '2016-06', '2018-03', '2016-08', '2018-02', '2018-05',
 '2017-04', '2017-06', '2018-06', '2016-01', '2016-10', '2018-01', '2017-10',
 '2019-01', '2018-11']
Length: 37, dtype: period[M]

In [17]:
# Preview the frame with the new 'month_time' column.
df.head()

Unnamed: 0,code,TEXT,Sent_type,Date,month_time
0,1,"Another reason I failed in love is I'm poor,I ...",Benefit,2018-07-31,2018-07
1,2,2019 Mercedes-Benz C 300 and AMG C 43 First Dr...,Neutral,2018-07-31,2018-07
2,3,"Happy birthday to the Audi chasing maniac, hop...",Benefit,2018-07-31,2018-07
3,4,Come see Mercedes Benz stadium in Atlanta...,Neutral,2018-07-31,2018-07
4,5,Italy took Cristiano and ever since then they’...,Neutral,2018-07-31,2018-07


In [18]:
# Label distribution over the full dataset.
df['Sent_type'].value_counts()

Neutral    195719
Benefit    182533
Risky       77475
Name: Sent_type, dtype: int64

### Dataset for March 2018

In [37]:
# Rows dated March 2018. The explicit .copy() makes this an independent frame,
# so the relabelling applied to it later does not operate on a slice of `df`
# (which pandas reports as SettingWithCopyWarning and may or may not propagate).
is_march_2018 = df['month_time'] == '2018-03'
df_march_2018 = df[is_march_2018].copy()
df_march_2018.shape

(12369, 5)

In [38]:
# Preview the March-2018 subset.
df_march_2018.head()

Unnamed: 0,code,TEXT,Sent_type,Date,month_time
7489,7490,New model BMW Diesel pic.twitter.com/VzgjfpFNER,Neutral,2018-03-10,2018-03
7490,7491,#MuseAfrica: Stonebwoy gives 2 warning shots s...,Risky,2018-03-10,2018-03
7491,7492,Reminder about Trump’s threat to tax foreign c...,Risky,2018-03-10,2018-03
7492,7493,Meant to say Audi R8,Neutral,2018-03-10,2018-03
7493,7494,1943 #VW #Volkswagen #KDF type 60; as rare as ...,Neutral,2018-03-10,2018-03


In [20]:
# The subset should contain a single month.
df_march_2018.month_time.unique()

<PeriodArray>
['2018-03']
Length: 1, dtype: period[M]

In [21]:
# Label distribution of the March-2018 subset before relabelling.
df_march_2018['Sent_type'].value_counts()

Benefit    5209
Neutral    4370
Risky      2790
Name: Sent_type, dtype: int64

In [22]:
# Fold 'Neutral' into 'Risky' for the March-2018 subset.
# - Series.replace matches whole values, whereas .str.replace rewrites any
#   substring occurrence and turns non-string values into NaN.
# - .assign builds a new frame rather than writing into what may be a slice of
#   `df`, so there is no chained-assignment ambiguity.
relabelled_sent_type = df_march_2018['Sent_type'].replace({'Neutral': 'Risky'})
df_march_2018 = df_march_2018.assign(Sent_type=relabelled_sent_type)

In [23]:
# Label distribution of the March-2018 subset after relabelling.
df_march_2018['Sent_type'].value_counts()

Risky      7160
Benefit    5209
Name: Sent_type, dtype: int64

In [33]:
# Overlay the relabelled March-2018 rows on the full frame: stack both frames,
# keep the last occurrence of each 'code' (the row coming from df_march_2018),
# then order the result by 'code'.
stacked = pd.concat([df, df_march_2018])
deduplicated = stacked.drop_duplicates(subset=['code'], keep='last')
df_merge = deduplicated.sort_values(by='code')

In [34]:
# Shape of the merged frame; compare with df.shape to confirm that the overlay
# neither added nor dropped rows.
df_merge.shape

(455727, 5)

In [35]:
# Label distribution after the March-2018 relabelling has been merged in.
df_merge['Sent_type'].value_counts()

Neutral    191349
Benefit    182533
Risky       81845
Name: Sent_type, dtype: int64

In [36]:
# Label distribution of the original frame, for comparison with df_merge.
df['Sent_type'].value_counts()

Neutral    195719
Benefit    182533
Risky       77475
Name: Sent_type, dtype: int64

In [42]:
# Spot-check index 7489, a March-2018 row shown as 'Neutral' in the subset preview.
df_merge.loc[7489]

code                                                     7490
TEXT          New model BMW Diesel pic.twitter.com/VzgjfpFNER
Sent_type                                               Risky
Date                                      2018-03-10 00:00:00
month_time                                            2018-03
Name: 7489, dtype: object

In [43]:
# Spot-check index 7492, another March-2018 row shown as 'Neutral' in the subset preview.
df_merge.loc[7492]

code                          7493
TEXT          Meant to say Audi R8
Sent_type                    Risky
Date           2018-03-10 00:00:00
month_time                 2018-03
Name: 7492, dtype: object

In [44]:
# Expected 'Neutral' count after relabelling: every 'Neutral' row in the
# original frame minus those dated March 2018. Computed from the data instead of
# hand-copying numbers from earlier outputs, and checked against df_merge.
neutral_total = (df['Sent_type'] == 'Neutral').sum()
neutral_in_march_2018 = (
    (df['month_time'] == '2018-03') & (df['Sent_type'] == 'Neutral')
).sum()
expected_neutral = neutral_total - neutral_in_march_2018
assert (df_merge['Sent_type'] == 'Neutral').sum() == expected_neutral
expected_neutral

191349

### January 2016 dataset

In [45]:
# January-2016 rows of the merged (relabelled) frame. The boolean mask and the
# frame it filters must be the same object; previously `df` was filtered with a
# mask built from `df_merge`, which relies on index realignment and returns rows
# from the un-relabelled frame.
is_jan_2016 = df_merge['month_time'] == '2016-01'
df_jan_2016 = df_merge[is_jan_2016]
df_jan_2016.shape

(14358, 5)

In [47]:
# Label distribution of the January-2016 subset.
df_jan_2016['Sent_type'].value_counts()

Neutral    7057
Benefit    5021
Risky      2280
Name: Sent_type, dtype: int64

### save file

In [None]:
# Persist the relabelled data. `df_merge` holds the March-2018
# Neutral -> Risky relabelling; writing `df` would save the original labels and
# discard that work.
# NOTE(review): confirm that the relabelled frame is the intended export.
df_merge.to_excel('descript_baimani.xlsx')

### Loading new baimani file

In [None]:
# Reload the saved workbook and strip the index column that to_excel wrote
# (read back by pandas as 'Unnamed: 0').
# The drop must be applied to `df_label`: dropping from `df` discards the frame
# just read and raises KeyError, because `df` has no 'Unnamed: 0' column.
# errors='ignore' keeps the cell working if the file was written without an index.
df_label = pd.read_excel('descript_baimani.xlsx')
df_label = df_label.drop(columns='Unnamed: 0', errors='ignore')

### save again to use in notebook

In [None]:
# Write the cleaned frame back without the row index; otherwise the index is
# stored as an extra leading column and shows up again as 'Unnamed: 0' when the
# file is read in another notebook.
df_label.to_excel('descript_baimani.xlsx', index=False)