In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
# Load the tweet CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='twcs.csv')

In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(2811774, 7)

<h3> ------------------------------------- </h3>
<h3> Top Twitter Users by Number of Tweets </h3>

In [4]:
# Number of rows (tweets) per author_id, largest first; null authors are excluded.
count = df['author_id'].value_counts(sort=True, ascending=False, dropna=True)

In [5]:
# One row per author_id; every other column holds its number of distinct values.
# NOTE(review): the cells that follow only read the 'count' column attached later,
# so nunique over all columns may be unnecessary work — confirm before simplifying.
grouped = df.groupby(by='author_id').nunique()

In [6]:
# Append the per-author tweet totals as a 'count' column; values align on the
# author_id index shared by both objects.
grouped = grouped.assign(count=count)

In [7]:
# Authors ordered from most to fewest tweets.
topUsers = grouped.sort_values('count', ascending=False)

In [8]:
# Show the 20 most active authors as a two-column table (author_id, count).
topUsers[['count']].head(20).reset_index()

Unnamed: 0,author_id,count
0,AmazonHelp,169840
1,AppleSupport,106860
2,Uber_Support,56270
3,SpotifyCares,43265
4,Delta,42253
5,Tesco,38573
6,AmericanAir,36764
7,TMobileHelp,34317
8,comcastcares,33031
9,British_Airways,29361


<h3> ----------------------- </h3>
<h3> Top Authors by Response </h3>

In [12]:
# Per author, count the rows whose response_tweet_id is not null.
responseCount = df['response_tweet_id'].groupby(df['author_id']).count()

In [13]:
# Rebuild the per-author frame (distinct-value counts per column) so the
# response totals can be attached to a fresh copy.
grouped = df.groupby(by='author_id').nunique()

In [14]:
# Append the per-author response totals as a 'responseCount' column, aligned
# on the author_id index.
grouped = grouped.assign(responseCount=responseCount)

In [15]:
# Authors ordered from highest to lowest responseCount.
topAuthorsResponse = grouped.sort_values('responseCount', ascending=False)

In [16]:
# Show the 10 authors with the highest responseCount as (author_id, responseCount).
topAuthorsResponse[['responseCount']].head(10).reset_index()

Unnamed: 0,author_id,responseCount
0,AmazonHelp,85274
1,AppleSupport,31564
2,Uber_Support,18036
3,VirginTrains,15235
4,AmericanAir,14556
5,SpotifyCares,13786
6,Delta,12014
7,Tesco,11148
8,GWRHelp,10915
9,VerizonSupport,10585


<h3> -------------- </h3>
<h3> Tweets by Date </h3>

In [17]:
# Parse created_at into datetimes, then overwrite the column so the .dt
# accessor can be used in the cells below.
created_at_parsed = pd.to_datetime(df['created_at'])
df['created_at'] = created_at_parsed

In [18]:
# Number of tweets per calendar date (time-of-day dropped), largest first.
created_dates = df['created_at'].dt.date
count = created_dates.value_counts()

In [20]:
# One row per calendar date of created_at; each column holds its number of
# distinct values on that date.
date_key = df['created_at'].dt.date
grouped = df.groupby(date_key).nunique()

In [21]:
# Append the per-date tweet totals as a 'count' column, aligned on the date index.
grouped = grouped.assign(count=count)

In [24]:
# Dates ordered from busiest to quietest.
tweetsByDate = grouped.sort_values('count', ascending=False)

In [25]:
# Show the 10 busiest dates as a two-column table (created_at, count).
tweetsByDate[['count']].head(10).reset_index()

Unnamed: 0,created_at,count
0,2017-11-07,62793
1,2017-10-27,59136
2,2017-11-08,58169
3,2017-11-06,57981
4,2017-11-03,57233
5,2017-12-01,53857
6,2017-11-14,53731
7,2017-11-29,53610
8,2017-11-28,53538
9,2017-11-30,53476


<h3> -------------- </h3>
<h3> Data Cleaning </h3>

In [26]:
# Per column: does it contain at least one missing value?
df.isnull().any(axis=0)

tweet_id                   False
author_id                  False
inbound                    False
created_at                 False
text                       False
response_tweet_id           True
in_response_to_tweet_id     True
dtype: bool

In [27]:
# Per column: how many values are missing?
df.isnull().sum(axis=0)

tweet_id                         0
author_id                        0
inbound                          0
created_at                       0
text                             0
response_tweet_id          1040629
in_response_to_tweet_id     794335
dtype: int64

In [28]:
# Boolean mask over rows: True where a row repeats an earlier row in every column.
df.duplicated(subset=None, keep='first')

0          False
1          False
2          False
3          False
4          False
           ...  
2811769    False
2811770    False
2811771    False
2811772    False
2811773    False
Length: 2811774, dtype: bool

In [29]:
# drop_duplicates returns a new frame and leaves df untouched, so the result
# must be bound to a name or the cleaning step is lost once the cell finishes.
# The raw df is kept as-is; downstream cleaning should read df_deduped.
df_deduped = df.drop_duplicates(subset=None, keep='first', ignore_index=False)
df_deduped

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,2017-10-31 22:10:47+00:00,@115712 I understand. I would like to assist y...,2,3.0
1,2,115712,True,2017-10-31 22:11:45+00:00,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,2017-10-31 22:08:27+00:00,@sprintcare I have sent several private messag...,1,4.0
3,4,sprintcare,False,2017-10-31 21:54:49+00:00,@115712 Please send us a Private Message so th...,3,5.0
4,5,115712,True,2017-10-31 21:49:35+00:00,@sprintcare I did.,4,6.0
...,...,...,...,...,...,...,...
2811769,2987947,sprintcare,False,2017-11-22 08:43:51+00:00,"@823869 Hey, we'd be happy to look into this f...",,2987948.0
2811770,2987948,823869,True,2017-11-22 08:35:16+00:00,@115714 wtf!? I’ve been having really shitty s...,2987947,
2811771,2812240,121673,True,2017-11-23 04:13:07+00:00,@143549 @sprintcare You have to go to https://...,,2812239.0
2811772,2987949,AldiUK,False,2017-11-22 08:31:24+00:00,"@823870 Sounds delicious, Sarah! 😋 https://t.c...",,2987950.0


<h3> --------------------- </h3>
<h3> Methods to Clean Text </h3>

In [30]:
# Remove extra whitespace
# Pad strings with extra whitespace or leading zeros in the case of numerical values
# Fix casing to be all upper or lower case, so casing can be ignored later
# Remove emojis if needed

<h3> --------------------- </h3>
<h3> Other Things to Clean </h3>

In [31]:
# Keep data within the range that makes sense for the problem, e.g. restrict dates to a relevant window
# Check for outliers and remove them if they don't make sense in the context of the problem
# Validate the data after cleaning to confirm it still matches the rules and context of the data and problem
# Make datatypes consistent, e.g. make sure all numerical values are stored as int or float