In [None]:
import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt
from scipy import stats

## Load Data

In [None]:
# Read the tweets CSV, parsing the column at position 1 as datetimes
# (presumably 'created_at', which later cells read as datetimes -- confirm against the CSV),
# then make the 'id' column the index.
data = pd.read_csv('dog_rates_tweets.csv', parse_dates=[1])
data = data.set_index(keys=['id'])

## Filter Data

In [None]:
# retrieve rate from tweet. return None if not found or rate over max_rate
def findRating(s, max_rate=20):
    """Extract the numerator of an 'N/10' rating from a tweet's text.

    Parameters
    ----------
    s : str
        Tweet text. Non-string values (for example the NaN pandas uses for
        a missing cell) are treated as containing no rating.
    max_rate : float, optional
        Ratings strictly greater than this value are treated as outliers.
        Defaults to 20.

    Returns
    -------
    str or None
        The numerator exactly as written in the text (e.g. '13' or '9.75'),
        or None when no rating is found or the rating exceeds ``max_rate``.
    """
    # re.search raises TypeError on non-strings; report them as "no rating"
    if not isinstance(s, str):
        return None

    # first "<integer or decimal>/10" occurrence in the text
    match = re.search(r'(\d+(\.\d+)?)/10', s)
    if match is None:
        return None

    rate = match.group(1)  # numerator text, without the "/10" part
    # filter ratings: anything above max_rate is reported as None
    if float(rate) > max_rate:
        return None
    return rate
    
# vectorize findRating so it can be applied to a whole column at once.
# The builtin `float` is used for the output dtype: the `np.float` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
findRating = np.vectorize(findRating, otypes=[float])

# get rates from text (a None result is stored as NaN in the float output)
data['rate'] = findRating(data['text'])

# exclude outliers and texts with no ratings; .copy() gives an independent
# frame so that adding columns to it later does not write to a slice
data = data[pd.notnull(data['rate'])].copy()

## Prepare for Linear Fitting

In [None]:
# helper: turn a datetime into its numeric timestamp so the values can be
# passed to <scipy.stats.linregress>; anything that is not a datetime gives None
def to_timestamp(dt):
    """Return ``dt.timestamp()`` for datetime.datetime instances, else None."""
    return dt.timestamp() if isinstance(dt, datetime.datetime) else None

# numeric x-values for the regression: one timestamp per 'created_at' entry
data['timestamp'] = (
    data['created_at']
    .apply(to_timestamp)
)

## Linear Fitting

In [None]:
# least-squares line of rating against tweet timestamp
slope, intercept, r_value, p_value, std_err = stats.linregress(
    x=data['timestamp'],
    y=data['rate'],
)

## Data, slope and intercept of the best-fit line, and a scatterplot with fit line

In [None]:
# display the filtered frame, including the 'rate' and 'timestamp' columns added in earlier cells
data

In [None]:
# display the fitted line's coefficients as a (slope, intercept) tuple
(slope, intercept)

In [None]:
# tilt the x tick labels so the dates stay readable
plt.xticks(rotation=25)
# raw ratings over time: semi-transparent blue dots
plt.plot(
    data['created_at'],
    data['rate'],
    'b.',
    alpha=0.5,
)
# best-fit line evaluated at every tweet's timestamp: thick red line
plt.plot(
    data['created_at'],
    intercept + slope * data['timestamp'],
    'r-',
    linewidth=3,
)
plt.show()

## End