# Data analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import re
from collections import Counter

# Render floats in fixed-point notation (six decimals) rather than scientific
# notation, so large values in `.describe()` output stay readable.
# The fully qualified option name is used: abbreviated names are matched as
# patterns and can become ambiguous if pandas adds another matching option.
pd.set_option('display.float_format', '{:f}'.format)

In [2]:
# Column labels for the raw CSV; they are supplied explicitly through
# `names=`, so pandas treats every line of the file (including the first) as data.
colnames = ['sent', 'id', 'date', 'query', 'name', 'tweet']

# NOTE(review): the saved outputs further down show a frame with only the
# `sent` and `tweet` columns (shape (1600000, 2)) and a KeyError on `id`,
# which this cell cannot produce on its own — the notebook appears to have
# been executed out of order. Re-run from a fresh kernel to confirm.
data = pd.read_csv(
    '../data/raw/data.csv',
    names=colnames,
    delimiter=',',
    encoding="ISO-8859-1",
)

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,sent,tweet
0,0,"- Awww, that's a bummer. You shoulda got Da..."
1,0,is upset that he can't update his Facebook by ...
2,0,I dived many times for the ball. Managed to s...
3,0,my whole body feels itchy and like its on fire
4,0,"no, it's not behaving at all. i'm mad. why am..."


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(1600000, 2)

In [5]:
# Summary statistics for the `sent` column.
data['sent'].describe()

count   1600000.000000
mean          2.000000
std           2.000001
min           0.000000
25%           0.000000
50%           2.000000
75%           4.000000
max           4.000000
Name: sent, dtype: float64

In [6]:
# List the distinct values that occur in `sent`.
data['sent'].unique()

array([0, 4])

In [7]:
# Summary of the `id` column.
# NOTE(review): the saved output of this cell is `KeyError: 'id'` even though
# `id` is one of the names passed to read_csv above; presumably the frame had
# been reduced to fewer columns in the session that produced it. Re-run the
# notebook from a fresh kernel so this cell does not end in an exception.
data['id'].describe()

KeyError: 'id'

In [9]:
# Summary of the `date` column; the saved output reports it as object dtype,
# so the result is count / unique / top / freq rather than numeric statistics.
data['date'].describe()

count                          1600000
unique                          774363
top       Mon Jun 15 12:53:14 PDT 2009
freq                                20
Name: date, dtype: object

In [10]:
# Summary of the `query` column (count / unique / top / freq).
data['query'].describe()

count      1600000
unique           1
top       NO_QUERY
freq       1600000
Name: query, dtype: object

In [11]:
# List the distinct values that occur in `query`.
data['query'].unique()

array(['NO_QUERY'], dtype=object)

In [12]:
# Summary of the `name` column (count / unique / top / freq).
data['name'].describe()

count      1600000
unique      659775
top       lost_dog
freq           549
Name: name, dtype: object

In [8]:
# Summary of the raw tweet text; `unique` below `count` means some texts repeat.
data['tweet'].describe()

count     1600000
unique    1555629
top              
freq         2501
Name: tweet, dtype: object

In [9]:
# Number of whitespace-separated tokens in each tweet, stored as a new column.
data['word_count'] = (
    data['tweet']
    .str.split()  # no separator given: split on runs of whitespace
    .str.len()    # length of each resulting token list
)

In [10]:
# Distribution of tweet lengths, measured in whitespace-separated tokens.
data['word_count'].describe()

count   1600000.000000
mean         12.638969
std           6.977789
min           0.000000
25%           7.000000
50%          12.000000
75%          18.000000
max          64.000000
Name: word_count, dtype: float64

In [11]:
# Store the whitespace-split tokens of each tweet as a list-valued column.
data['words'] = data['tweet'].str.split()

In [12]:
# Inspect the token lists stored in `words`.
data['words']

0          [-, Awww,, that's, a, bummer., You, shoulda, g...
1          [is, upset, that, he, can't, update, his, Face...
2          [I, dived, many, times, for, the, ball., Manag...
3          [my, whole, body, feels, itchy, and, like, its...
4          [no,, it's, not, behaving, at, all., i'm, mad....
                                 ...                        
1599995    [Just, woke, up., Having, no, school, is, the,...
1599996    [TheWDB.com, -, Very, cool, to, hear, old, Wal...
1599997    [Are, you, ready, for, your, MoJo, Makeover?, ...
1599998    [Happy, 38th, Birthday, to, my, boo, of, alll,...
1599999                             [happy, #charitytuesday]
Name: words, Length: 1600000, dtype: object

In [13]:
# 100 most frequent tokens (runs of 3+ word characters, lower-cased) in the
# first 800,000 tweets.
# The pattern is a raw string: '\w' inside a plain string literal is an
# invalid escape sequence, which newer Python versions warn about.
# NOTE(review): head(800000) presumably isolates the tweets with sent == 0;
# that only holds if the rows are ordered by label — confirm, or select with
# data.loc[data['sent'] == 0, 'tweet'] instead.
Counter(
    re.findall(r'\w{3,}', ' '.join(data['tweet'].head(800000)).lower())
).most_common(100)

[('the', 257496),
 ('and', 153837),
 ('you', 103692),
 ('for', 98855),
 ('but', 84875),
 ('that', 82745),
 ('have', 82739),
 ('not', 74887),
 ('just', 63912),
 ('was', 59216),
 ('this', 52914),
 ('now', 52534),
 ('can', 52129),
 ('with', 50071),
 ('get', 45488),
 ('work', 45262),
 ('out', 43800),
 ('all', 42079),
 ('like', 40996),
 ('day', 39770),
 ('today', 37993),
 ('too', 37463),
 ('are', 33976),
 ('going', 33636),
 ('got', 33230),
 ('back', 33005),
 ('don', 32837),
 ('really', 31422),
 ('miss', 30620),
 ('want', 29779),
 ('what', 29289),
 ('still', 29199),
 ('good', 29172),
 ('from', 27992),
 ('sad', 27860),
 ('had', 27756),
 ('time', 27486),
 ('one', 27213),
 ('know', 26290),
 ('quot', 26166),
 ('about', 26119),
 ('its', 25488),
 ('they', 25008),
 ('home', 24463),
 ('there', 23891),
 ('lol', 23095),
 ('will', 22925),
 ('amp', 22465),
 ('last', 22426),
 ('wish', 22370),
 ('when', 22324),
 ('need', 22233),
 ('feel', 21916),
 ('think', 21848),
 ('bad', 21664),
 ('why', 21019),
 ('has

In [14]:
# 100 most frequent tokens (runs of 3+ word characters, lower-cased) in the
# last 800,000 tweets.
# The pattern is a raw string: '\w' inside a plain string literal is an
# invalid escape sequence, which newer Python versions warn about.
# NOTE(review): tail(800000) presumably isolates the tweets with sent == 4;
# that only holds if the rows are ordered by label — confirm, or select with
# data.loc[data['sent'] == 4, 'tweet'] instead.
Counter(
    re.findall(r'\w{3,}', ' '.join(data['tweet'].tail(800000)).lower())
).most_common(100)

[('the', 265516),
 ('you', 197912),
 ('and', 149481),
 ('for', 117100),
 ('that', 82997),
 ('with', 64978),
 ('just', 62833),
 ('have', 62659),
 ('good', 61987),
 ('but', 48580),
 ('love', 47585),
 ('day', 47075),
 ('quot', 45921),
 ('was', 45285),
 ('your', 44471),
 ('are', 44018),
 ('this', 41505),
 ('all', 41349),
 ('can', 41313),
 ('now', 41292),
 ('out', 38314),
 ('like', 37458),
 ('get', 36458),
 ('lol', 36080),
 ('thanks', 34438),
 ('not', 33609),
 ('going', 30851),
 ('time', 30372),
 ('today', 30013),
 ('what', 30003),
 ('will', 29435),
 ('too', 28186),
 ('from', 28118),
 ('got', 27948),
 ('new', 26859),
 ('one', 26640),
 ('some', 26554),
 ('amp', 26243),
 ('see', 25772),
 ('know', 25710),
 ('great', 25265),
 ('about', 24291),
 ('back', 23808),
 ('night', 23514),
 ('how', 23349),
 ('well', 23263),
 ('haha', 21629),
 ('there', 21310),
 ('morning', 20795),
 ('happy', 20472),
 ('had', 20293),
 ('think', 19500),
 ('work', 19489),
 ('twitter', 19426),
 ('then', 19350),
 ('its', 1918

In [15]:
# Display the frame including the derived `word_count` and `words` columns;
# pandas abbreviates the rendering of a frame this large.
data

Unnamed: 0,sent,tweet,word_count,words
0,0,"- Awww, that's a bummer. You shoulda got Da...",17,"[-, Awww,, that's, a, bummer., You, shoulda, g..."
1,0,is upset that he can't update his Facebook by ...,21,"[is, upset, that, he, can't, update, his, Face..."
2,0,I dived many times for the ball. Managed to s...,17,"[I, dived, many, times, for, the, ball., Manag..."
3,0,my whole body feels itchy and like its on fire,10,"[my, whole, body, feels, itchy, and, like, its..."
4,0,"no, it's not behaving at all. i'm mad. why am...",20,"[no,, it's, not, behaving, at, all., i'm, mad...."
...,...,...,...,...
1599995,4,Just woke up. Having no school is the best fee...,11,"[Just, woke, up., Having, no, school, is, the,..."
1599996,4,TheWDB.com - Very cool to hear old Walt interv...,10,"[TheWDB.com, -, Very, cool, to, hear, old, Wal..."
1599997,4,Are you ready for your MoJo Makeover? Ask me f...,11,"[Are, you, ready, for, your, MoJo, Makeover?, ..."
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...,12,"[Happy, 38th, Birthday, to, my, boo, of, alll,..."
