In [1]:
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob, Word
import numpy as np
import pandas as pd
import re

# Machine Learning Pipeline
Here is the pipeline that we will be following in order to obtain the final "on the fly" movie recommendation for a never seen twitter user.

### Data selection
We select only the english tweets, all talking about movies using our filter, and after retrieving the names of each distinct user, we can match them against all of the twitter database to get the final dataset. The said dataset contains all of the tweets for each user that ever rated a movie on IMDB on twitter.

### Twitter user vectors
For the totality of tweets for each twitter user, we compute the vector the following way, we remove:
    - Stopwords
    - Numbers
    - Punctuation
    - Special characters
    - Hyperlinks
We then proceed to stem each term after lowering, and compute 2-grams. With this new dataset, we compute a simple TF-IDF using all of the tweets of each user as a document, and each found word as a feature.

### Similarity
Everytime we see a new user, we compute the vector of all his/her tweets, and then compare it to all of the other users by computing their cosine similarity, we finally then get a new list of similarities that we can sort to find out with which other user that talks about movies our own user has similarities.

## Movie prediction
Using the matrix of TF-IDF joined with the new user, we can compute its similarity to every other user, and pick the few highest similarities. Using these best matches, and their liked movie genres, we can recommend a list of ordered movies to the new user.


### Bonus
We probably will have time by the end of the project to dive deep into the data to leverage some interesting stories for each user.

-------------------------

# Spark

In [None]:
# language, id, date, user_id, tweet
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
tweets = sc.textFile("/datasets/tweets-leon")
good_tweets = tweets.map(lambda s: tuple(s.split('\t'))).filter(lambda s: s[0] == 'en')

# There are 12 488 903 101 tweets in english, in the dataset

def filter_tweets_about_movies(row):
    """
    This function is used to filter out tweets based on if they rate
    an IMDB movie or not. 
    Returns a True or False depending on wether they rate a movie.
    """
    if len(row) < 5:
        return False
    return row[4].lower().find('imdb') > -1 and row[4].lower().find('i rated') > -1

user_about_movies = good_tweets.filter(lambda x: filter_tweets_about_movies(x)).map(lambda x: x[3]).distinct()
user_as_key = good_tweets.filter(lambda x: x in user_about_movies).map(lambda x: (x[3],x[4]))

# Sampled tweets

In [2]:
tweets = pd.read_csv('data/tweets_sampled.csv', delimiter='|', names=['language', 'id', 'date', 'user', 'text'], header=None, encoding='utf8')
tweets = tweets.set_index('id')
tweets.text.replace('&amp;', '&', regex=True,inplace=True)

In [3]:
def get_rate_and_movies(tweet, rate=True):
    m = re.search('I rated (.+)\s(\d+/\d\d)', tweet )
    if rate:
        return m.group(2)
    return m.group(1)

In [4]:
def get_users(row):
    return row['text'].lower().find('imdb') > -1 and row['text'].lower().find('i rated') > -1

In [5]:
users = tweets[tweets.apply(lambda row: get_users(row), axis=1)].copy()

In [6]:
users['rate_movie'] = users.apply(lambda row: get_rate_and_movies(row['text']), axis=1)
users['title'] = users.apply(lambda row: get_rate_and_movies(row['text'], False), axis=1)

users['temp'] = 1
users.head()

Unnamed: 0_level_0,language,date,user,text,rate_movie,title,temp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
257268213925163008,en,Sat Oct 13 23:54:56 +0000 2012,thalassini_ap,I rated The Skin I Live In 7/10 http://t.co/9K...,7/10,The Skin I Live In,1
356986999091105792,en,Tue Jul 16 04:01:47 +0000 2013,mookieineugene,I rated Tyrannosaur 8/10 http://t.co/7ALLdUDzz...,8/10,Tyrannosaur,1
204711915803512832,en,Mon May 21 23:14:58 +0000 2012,loyacmoiveclub,I rated Melancholia 7/10 http://t.co/GqofMiC5 ...,7/10,Melancholia,1
339882044429594624,en,Wed May 29 23:12:48 +0000 2013,CamShel06,I rated Safety Not Guaranteed 7/10 http://t.co...,7/10,Safety Not Guaranteed,1
363269277907828737,en,Fri Aug 02 12:05:19 +0000 2013,chediJ,I rated This Means War 6/10 http://t.co/5LKEhM...,6/10,This Means War,1


In [7]:
def date_parser(date):
    # returns the date under day/month/year format
    months = ['Jan', 'Feb', 'Mar',
              'Apr', 'May', 'Jun', 
              'Jul', 'Aug', 'Sep', 
              'Oct', 'Nov', 'Dec']
    month_mapper = { month: months.index(month) + 1 for month in months }
    date = date.split()
    return "{}/{}/{}".format(date[2], month_mapper[date[1]], date[-1])
    
tweets.date = tweets.date.apply(func=date_parser)
tweets['datetime'] = pd.to_datetime(tweets.date, format='%d/%m/%Y')
tweets.drop('date', axis=1, inplace=True)

In [8]:
# create a new column text_clean with the preprocessed tweet
f1 = lambda t: ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", t.lower()).split())
f = lambda t: ' '.join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", t.lower()).split())
tweets['text_clean'] = tweets['text'].replace(r'\\n', ' ', regex=True).map(f)

# Movies

In [9]:
#tmdb_credits = pd.read_csv('data/tmdb_5000_credits.csv', delimiter=',')
tmdb_movies = pd.read_csv('data/tmdb_5000_movies.csv', delimiter=',', parse_dates=['release_date'])

In [10]:
tmdb_movies_df = tmdb_movies[['id', 'original_language', 'original_title', 'popularity', 'release_date', 'title', 'vote_average']]
tmdb_movies_df = tmdb_movies_df.set_index('id')
tmdb_movies_df = tmdb_movies_df.sort_values('vote_average', ascending=False)
tmdb_movies_df['temp'] = 1
tmdb_movies_df.head()

Unnamed: 0_level_0,original_language,original_title,popularity,release_date,title,vote_average,temp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
89861,en,Stiff Upper Lips,0.356495,1998-06-12,Stiff Upper Lips,10.0,1
361505,en,Me You and Five Bucks,0.094105,2015-07-07,Me You and Five Bucks,10.0,1
78373,en,"Dancer, Texas Pop. 81",0.376662,1998-05-01,"Dancer, Texas Pop. 81",10.0,1
40963,en,Little Big Top,0.0921,2006-01-01,Little Big Top,10.0,1
346081,en,Sardaarji,0.296981,2015-06-26,Sardaarji,9.5,1


In [11]:
def similarity(title1, title2):
    return SequenceMatcher(None,title1, title2).ratio() > 0.85

In [12]:
#Get df with twwets about movie link to the movie
merge_df = pd.merge(tmdb_movies_df, users, on='temp')
merge_df = merge_df[merge_df.apply(lambda row: similarity(row['title_x'], row['title_y']), axis=1)]
print(merge_df.shape)
merge_df.head()

(11, 13)


Unnamed: 0,original_language,original_title,popularity,release_date,title_x,vote_average,temp,language,date,user,text,rate_movie,title_y
3501,en,Jurassic Park,40.413191,1993-06-11,Jurassic Park,7.6,1,en,Sun Nov 25 01:01:54 +0000 2012,kxi,I rated Jurassic Park III 5/10 http://t.co/dUG...,5/10,Jurassic Park III
9653,en,Life as a House,5.706767,2001-10-25,Life as a House,7.2,1,en,Sun Jul 22 20:35:19 +0000 2012,rymdforskarn,I rated Life as a House 9/10 http://t.co/I4HnI...,9/10,Life as a House
10629,en,The Proposition,14.617944,2005-10-06,The Proposition,7.1,1,en,Sat Aug 31 22:31:16 +0000 2013,iEDWINnl,Now that's silly crap. Wasting my time. I rate...,1/10,The Proposition
13556,en,Monsters University,89.186492,2013-06-20,Monsters University,7.0,1,en,Wed Jul 24 18:13:26 +0000 2013,jckfrdplus44,"I rated Monsters University 8/10. Awesome, fun...",8/10,Monsters University
14747,da,Melancholia,36.173598,2011-05-26,Melancholia,7.0,1,en,Mon May 21 23:14:58 +0000 2012,loyacmoiveclub,I rated Melancholia 7/10 http://t.co/GqofMiC5 ...,7/10,Melancholia


# similarity between users

## 1 clean data

we get a csv with the username, and text composed by all the tweet for the given user that have talked about movies. thus we have to preprocess it, in order to have clean tweets ready to be used.

In [13]:
csv_df = tweets.reset_index()[['user', 'text']].groupby('user').sum()

In [14]:
def preprocess_data(df):
    data = df.copy()
    # We start to delete the link, and all the symbol in the tweets
    f = lambda t: ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(\d+)", " ", t.lower()).split())
    data['text'] = data['text'].replace(r'\\n', ' ', regex=True).map(f)
    # then we stem our words
    data['text'] = [Word(word).stem() for word in data['text']]
    return data

In [15]:
tweets_clean = preprocess_data(csv_df)
tweets_clean.head()

Unnamed: 0_level_0,text
user,Unnamed: 1_level_1
000005634567754,rt aa
00001Belieber,back to school supplies haul giveaway
0000O_O000,rt ketchens never give up on yourself once you...
0000nooney,rt da world would b betta if everybody worry b...
0000sui,kisses ur tears away


## 2 Tf-IDF and make vectors

Then we compute the tf-idf. Thus, we got a vecors for each user that we could compare with the vectors of any other user in order to make our recommander movie system for this user.

In [16]:
def make_vectors(df):
    '''
    with a user and all his tweets we can create a valid vectors in order
    to compare it with all the user that have talked about movies 
    and then, recommanded him a movie
    '''
    # We build up a vectorizer to create our tf-idf matrix
    vectorizer = TfidfVectorizer(strip_accents='ascii', ngram_range=(1,2), stop_words='english')
    vectors = vectorizer.fit_transform(df['text'])
    return vectorizer, vectors

In [17]:
vectorizer, tf_idf = make_vectors(tweets_clean)
print("Here is the shape of our TFIDF sparse matrix: {} rows and {} columns".format(tf_idf.shape[0], tf_idf.shape[1]))

Here is the shape of our TFIDF sparse matrix: 1804254 rows and 5030835 columns
