In [1]:
import pandas as pd
import sys
import datetime
import time
import numpy as np

### README

Important note: Don't push the dataset to GitHub. The file paths below match my computer and may need to be adjusted to point to wherever you saved the data.

Definitions of terms:

- full_ratings - df of all 25M ratings from MovieLens
- ratings_sample - df of 5% of the full ratings csv
- ratings - df merged ratings_sample w/ movies to include movie title
- movies - df with one row for every film
- tags - df with one row for every tag
- wiki_list - df with cult movies, release year,  director
- oscars - df with oscar best picture nominees
- links - df with movieid, imdbid, tmdbid

#### Import data and create dataframes

In [2]:
# Load every rating from the MovieLens 25M dataset into 'full_ratings'
full_ratings = pd.read_csv(filepath_or_buffer='../ml-25m/ratings.csv')

In [None]:
# Summary statistics for how many ratings each movie received (full dataset)
ratings_per_movie = full_ratings.groupby('movieId')['rating'].count()
ratings_per_movie.sort_values().describe()

In [None]:
# Load the supporting tables used alongside the ratings.
# (The ratings themselves are sampled further down, after filtering.)
all_movies = pd.read_csv('../ml-25m/movies.csv')
tags = pd.read_csv('../ml-25m/tags.csv')
wiki_list = pd.read_csv('../Wikipedia_cult.csv')
oscars = pd.read_csv('../oscars.csv')
# imdbId is read as text (not parsed as a number) because it is later
# concatenated with the 'tt' prefix.
links = pd.read_csv('../ml-25m/links.csv', converters={'imdbId': str})

## Filter all_movies

#### Add columns to Movies DataFrame
- Average rating per movie (avg_rating)
- Total number of ratings per movie (ratings_count)

In [None]:
#all_movies.set_index('movieId',inplace = True)
#groupedmovies = full_ratings.groupby('movieId')
#all_movies['avg_rating'] = groupedmovies.mean()['rating']
#all_movies['ratings_count'] = groupedmovies.count()['rating']

In [None]:
# Per-movie rating statistics, computed once from the full ratings table
ratings_by_movie = full_ratings.groupby('movieId')['rating']
# DataFrame of movie averages
avgmovies = pd.DataFrame(ratings_by_movie.mean())
# DataFrame of ratings count per movie
countmovies = pd.DataFrame(ratings_by_movie.count())
# Combine both statistics into one DataFrame indexed by movieId
new_movie_cols = avgmovies.merge(countmovies, left_index=True, right_index=True)
new_movie_cols.columns = ['avg_rating', 'ratings_count']
# Attach the statistics to the movie metadata (inner join on movieId)
all_movies = all_movies.merge(new_movie_cols, left_on='movieId', right_index=True)
# Filter out all movies with fewer than 36 ratings. The explicit .copy() makes
# `movies` an independent frame, so the columns added to it in later cells do
# not trigger SettingWithCopyWarning.
movies = all_movies[all_movies['ratings_count'] > 35].copy()
len(movies)

## Add new columns to movie dataframe

- Add column for release year

In [None]:
def release_year(row):
    """Split a title of the form 'Name (YYYY)' into its release year and bare name.

    Parameters
    ----------
    row : mapping or Series
        Must provide a string under the key 'title'.

    Returns
    -------
    tuple
        ``(year, short_title)``. ``year`` is an int when the stripped title ends
        in a parenthesised four-digit year starting with 1 or 2; otherwise it is
        ``np.nan`` and the stripped title is returned unchanged.
    """
    title = row['title'].strip()
    # A trailing "(YYYY)" is exactly six characters. Slicing (instead of
    # indexing title[-6]) keeps titles shorter than that from raising IndexError.
    suffix = title[-6:]
    digits = suffix[1:5]
    has_year = (
        len(suffix) == 6
        and suffix[0] == '('
        and suffix[-1] == ')'
        and digits[0] in '12'
        and digits.isdecimal()  # guards int() against suffixes like "(1x9y)"
    )
    if has_year:
        # Drop the year, then any whitespace that separated it from the name.
        return int(digits), title[:-6].rstrip()
    # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
    return np.nan, title

# Create 2 new columns in the movies dataframe: the release year and a shortened title.
# release_year() is evaluated once per row and its (year, title) pair is unpacked
# into both columns. assign() builds a new frame instead of writing columns into
# a frame that may be a filtered slice of all_movies.
year_title_pairs = movies.apply(release_year, axis=1)
movies = movies.assign(
    release_year=year_title_pairs.map(lambda pair: pair[0]),
    short_title=year_title_pairs.map(lambda pair: pair[1]),
)

- Split genre column into list

In [None]:
def genre_clean(row):
    """Return the pipe-separated 'genres' value of a row as a list of genre names."""
    return row['genres'].split('|')

# Add 'genre_clean' column that contains the list of the movie's genres, in the
# order they appear in the pipe-separated 'genres' string.
# One vectorized split replaces the previous two row-wise passes: the first pass
# only stored the first genre and was immediately overwritten by the second.
movies = movies.assign(genre_clean=movies['genres'].str.split('|'))

## Wikipedia dataframe

- clean Wikipedia list titles

In [None]:
def move_the(row):
    """Rewrite a leading 'The ' in row['Title'] as a trailing ', The' (MovieLens style).

    Titles that do not begin with 'The ' are returned unchanged.
    """
    title = row['Title']
    if title[:4] != 'The ':
        return title
    return title[4:] + ', The'

# Store the MovieLens-style title in 'short_title' so the wiki dataframe can be
# matched against the column of the same name in the movies dataframe.
wiki_list['short_title'] = wiki_list.apply(move_the, axis=1)

- clean release year

In [None]:
def fix_year(row):
    """Return the (start) year found in row['Year'] as a float.

    A single year ('1999') is returned as 1999.0. For anything longer, such as
    a range of years for a series, the first four characters are used, so the
    start year is returned. Values that do not begin with a four-digit year
    starting with 1 or 2 (including missing or non-numeric values) give np.nan.
    """
    # str() lets non-string cells (NaN, ints, floats) be handled without len()
    # raising TypeError.
    year = str(row['Year']).strip()
    start = year[:4]
    if len(start) == 4 and start[0] in '12' and start.isdecimal():
        return float(start)
    # np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
    return np.nan

# Numeric release year for each wiki entry, used as a merge key with movies
wiki_list['release_year'] = wiki_list.apply(fix_year, axis=1)

- add 'iscult' column to movies dataframe

In [None]:
# Add a column to mark all cult movies as True
wiki_list['iscult'] = True
# Create a new dataframe that only includes the title, release year, and iscult bool.
# Columns are selected by name so the result does not depend on how many columns
# the Wikipedia CSV happens to have. Duplicate (title, year) pairs are dropped
# because each one would otherwise duplicate the matching movie row in the merge.
wiki = wiki_list[['short_title', 'release_year', 'iscult']].drop_duplicates()
# Left merge onto movies so that all movies that match both title and year are
# True in the iscult column; unmatched movies get NaN.
movies = movies.merge(wiki, how='left', on=['short_title', 'release_year'])

## Links dataframe

- clean imdbId and add 'isoscar' column to movies dataframe

In [None]:
# Formats a row's imdbId to match the IMDb ids used in 'oscars'
def fix_imdb(row):
    """Return row['imdbId'] (a string) with the 'tt' prefix prepended."""
    imdb_digits = row['imdbId']
    return 'tt' + imdb_digits

# Prefix every imdbId in the links dataframe with 'tt'.
links['imdbId'] = links.apply(fix_imdb, axis=1)

# Join links onto movies by movieId so each movie row carries its imdbId.
movies = pd.merge(movies, links, on='movieId')
# 'isoscar' is True when the movie's imdbId appears in the 'Const' column of oscars.
oscar_ids = oscars['Const']
movies['isoscar'] = movies['imdbId'].isin(oscar_ids)

## Merge with ratings dataframe

- merge and drop unnecessary columns

In [None]:
# Join every rating to its movie's metadata, then discard the two columns that
# are not used downstream.
all_ratings = (
    full_ratings
    .merge(movies, on='movieId')
    .drop(columns=['genres', 'tmdbId'])
)

## Filter all_ratings

In [None]:
# Keep only ratings of movies that have more than 35 ratings in total
has_enough_ratings = all_ratings['ratings_count'] > 35
ratings = all_ratings.loc[has_enough_ratings]

## Take sample of ratings to work with

In [None]:
# Draw a 5% random sample of the filtered ratings. A fixed random_state makes
# the sample (and everything derived from it) reproducible across kernel restarts.
ratings_sample = ratings.sample(frac=0.05, random_state=42)

#### Convert timestamp to datetime object

In [None]:
%timeit pd.Timestamp(times_only['timestamp'][0], unit='s')

In [None]:
# One-column frame holding only the raw rating timestamps
times_only = full_ratings['timestamp'].to_frame()
times_only.head()

In [None]:
# Convert the integer 'timestamp' column (interpreted as seconds, unit='s') to
# datetimes in a single vectorized call, instead of constructing one
# pd.Timestamp per row with a row-wise apply over the whole ratings table.
times_only['datetime'] = pd.to_datetime(times_only['timestamp'], unit='s')

In [None]:
# Add the 'datetime' column by converting each rating's own timestamp.
# Joining against times_only on 'timestamp' is avoided: a timestamp is not a
# unique key (times_only has one row per rating in full_ratings), so every
# rating would be repeated once for each rating that shares its timestamp.
ratings = ratings.assign(datetime=pd.to_datetime(ratings['timestamp'], unit='s'))

## Save to CSV

In [40]:
#ratings.to_csv('ratings_clean.gzip', compression='gzip', index=False)

#### Creating Cult/Criterion/Oscars Columns

In [None]:
"""Team this is a function that I used to generate a column in the tag data frame indicating whether or not
a scored tag was ever applied by a user. I am including this in our collab group because I think there are clear
applications for creating similar columns in the movie database. Instead of joins on movie titles etc, it 
seems like we can use cult/criterion/oscar dfs as references to assign a boolean designation to each movie in
our movie df. Happy to explain more on slack etc."""

# Second, improved and working version of top g.
def top_g(x):
    """Flag scored tags that users also applied and keep the 100 most relevant rows.

    Parameters
    ----------
    x : DataFrame
        Scored tags with columns 'movieId', 'tag' and 'relevance'. Only the first
        movieId is used for the lookup, so the rows are expected to belong to a
        single movie.

    Returns
    -------
    DataFrame
        Up to 100 rows of ``x`` sorted by descending 'relevance', with two added
        columns: 'user_tag' (True when the tag is among the tags in the movie's
        ``gp_tags`` group) and 'ut_tstamp' (all NaN).

    Notes
    -----
    Reads the module-level ``gp_tags`` (tags grouped by movieId).
    """
    try:
        movie_user_tags = gp_tags.get_group(x.movieId.iloc[0]).tag
        y = x.assign(user_tag=x.tag.isin(movie_user_tags))
    except (KeyError, IndexError):
        # KeyError: the movie has no group in gp_tags (no user tags).
        # IndexError: x has no rows. Any other error (for example gp_tags not
        # being defined yet) is a real problem and is allowed to propagate
        # instead of silently marking every tag False.
        y = x.assign(user_tag=False)

    y = y.assign(ut_tstamp=np.nan)

    return y.sort_values(by='relevance', ascending=False).iloc[0:100]

### Tag Work

#### Tag Cleaning

In [None]:
#load all the other relevant databases
# NOTE(review): `tags` was already loaded from '../ml-25m/tags.csv' earlier in the
# notebook; this reload from './data/' replaces it — confirm both paths refer to
# the same file.
tags = pd.read_csv('./data/tags.csv')
g_tags = pd.read_csv('./data/genome-tags.csv')
g_scores = pd.read_csv('./data/genome-scores.csv')
# NOTE(review): `r_samp_1` is not defined in any cell visible in this notebook, so
# this line raises NameError on a fresh kernel unless it is created elsewhere —
# confirm. `m_samp` is not used again in the visible cells.
m_samp = pd.merge(r_samp_1,movies, on='movieId')
# user-applied tags grouped per movie; this is the global that top_g, top_g2 and
# tag_tstamp2 look up with get_group()
gp_tags = tags.groupby('movieId')
# join each genome score to its genome-tag row via tagId
# (presumably this is what supplies the 'tag' text column used later — TODO confirm)
g_score_m = pd.merge(g_scores,g_tags, on='tagId')
#now group scores by movieId, and get the first group to work as a sample
#sort g score merge by movie id may relieve the weird error I am having later
g_score_m = g_score_m.sort_values(by = 'movieId')
# discard the shuffled index left by the sort and renumber rows from 0
g_score_m = g_score_m.reset_index(level = 0, drop = True)
# one group of scored tags per movie; top_g2 is applied to each group below
gs_gp = g_score_m.groupby(g_score_m.movieId)

# Second, improved and working version of top g (top-25 variant with timestamps).
def top_g2(x):
    """Flag scored tags that users also applied, keep the 25 most relevant rows,
    and attach the user-tag timestamps for those rows.

    Parameters
    ----------
    x : DataFrame
        Scored tags with columns 'movieId', 'tag' and 'relevance'. Only the first
        movieId is used for the lookup, so the rows are expected to belong to a
        single movie.

    Returns
    -------
    DataFrame
        Up to 25 rows of ``x`` sorted by descending 'relevance', with two added
        columns: 'user_tag' (True when the tag is among the tags in the movie's
        ``gp_tags`` group) and 'ut_tstamp' (the value returned by
        ``tag_tstamp2`` for the row).

    Notes
    -----
    Reads the module-level ``gp_tags`` and calls ``tag_tstamp2``.
    """
    try:
        movie_user_tags = gp_tags.get_group(x.movieId.iloc[0]).tag
        y = x.assign(user_tag=x.tag.isin(movie_user_tags))
    except (KeyError, IndexError):
        # KeyError: the movie has no group in gp_tags (no user tags).
        # IndexError: x has no rows. Other errors are real problems and are
        # allowed to propagate rather than being silently swallowed.
        y = x.assign(user_tag=False)

    # Rank and trim first: the per-row timestamp lookup is independent for each
    # row, so computing it only for the rows that are kept gives the same values
    # while skipping the lookups for rows that would be discarded.
    y = y.sort_values(by='relevance', ascending=False).iloc[0:25]

    if y.empty:
        return y.assign(ut_tstamp=np.nan)

    return y.assign(ut_tstamp=y.apply(tag_tstamp2, axis=1))

def tag_tstamp2(row):
    """Return the distinct timestamps at which users applied this row's tag to its movie.

    Parameters
    ----------
    row : Series-like
        Must expose the attributes ``user_tag``, ``movieId`` and ``tag``.

    Returns
    -------
    numpy.ndarray or float
        When ``row.user_tag`` is truthy: a 1-D array of the unique 'timestamp'
        values, in ascending order, from the movie's ``gp_tags`` group where the
        tag equals ``row.tag``. Otherwise ``np.nan``.

    Notes
    -----
    Reads the module-level ``gp_tags`` (tags grouped by movieId).
    """
    if not row.user_tag:
        return np.nan

    # Fetch the movie's user tags once and reuse them for both the mask and the values.
    movie_tags = gp_tags.get_group(row.movieId)
    ttimes = movie_tags.timestamp[movie_tags.tag == row.tag]

    # np.array() on a set produces a 0-d object array that merely wraps the set;
    # sorting into a list first yields a real 1-D array with a stable order.
    return np.array(sorted(set(ttimes)))

# Apply top_g2 to every movie's scored tags, flatten the two-level index that
# groupby-apply produces, and save the result to disk.
# The groupby-apply is run once; the earlier duplicate pass produced a result
# that was overwritten before it was ever used.
gs_gp4 = gs_gp.apply(top_g2)
gs_gp4 = gs_gp4.reset_index(level = [0,1], drop = True)
gs_gp4.to_csv('tag_upd_25.csv')


#### Tag Analysis

In [None]:
#original ingest

# Reload the per-movie top tags saved by the tag-cleaning step. Note that this
# rebinds `tags` to the cleaned table.
tags = pd.read_csv('./tag_upd_25.csv')
# 'Unnamed: 0' is the index column that to_csv wrote out
tags = tags.drop(labels = 'Unnamed: 0',axis=1)
# One row per (movie, tag); movies without any tag rows are kept (left join)
mov_tags = movies.merge(tags, how = 'left', on = 'movieId')

#group by data frames
cults = mov_tags[mov_tags.iscult == True]
oscar = mov_tags[mov_tags.isoscar == True]
gp_cults = cults.groupby(cults.movieId)
# The Oscar groupbys are built from `oscar` (the Oscar subset of mov_tags), not
# from `oscars` (the nominee list loaded from oscars.csv) or from `cults`.
gp_oscars = oscar.groupby(oscar.movieId)
gp_cult_tags = cults.groupby(cults.tag)
gp_oscars_tags = oscar.groupby(oscar.tag)

# NOTE(review): `plt` is used below, but no matplotlib import is visible in this
# notebook's import cell — confirm `import matplotlib.pyplot as plt` runs before
# this cell.

def _tag_relevance_stats(tag_groups):
    """Describe 'relevance' per tag group, most frequently occurring tags first."""
    # Selecting the column before describe() avoids summarising every other
    # numeric column only to throw those results away.
    return tag_groups['relevance'].describe().sort_values(by = 'count', ascending = False)


def _plot_tag_scatter(tag_stats, n_top, y_col, group_label, y_label, out_file):
    """Scatter all tags (pale) with the first `n_top` rows highlighted, and save the figure.

    `tag_stats` must have a 'mean' column (x axis) and a `y_col` column (y axis).
    Returns the highlighted slice ``tag_stats.iloc[0:n_top]``.
    """
    top_tags = tag_stats.iloc[0:n_top]
    # A single subplots() call creates the figure; no separate plt.figure(),
    # which would leave an extra empty figure open for every plot.
    fig, ax = plt.subplots(figsize = (10,8))
    ax.scatter(tag_stats['mean'], tag_stats[y_col], color = ['#F4D7A4'])
    ax.scatter(top_tags['mean'], top_tags[y_col])
    ax.set_title('Tags Plotted by Count and Mean Relevance Score:\n' + group_label)
    ax.set_xlabel('Mean Relevance Score')
    ax.set_ylabel(y_label)
    fig.savefig(out_file)
    return top_tags


#relevant cult tags and plot
cnt_and_rel = _tag_relevance_stats(gp_cult_tags)
# NOTE(review): 1184 is hard-coded; presumably the number of cult films when this
# was written — confirm it still matches the current data.
cnt_and_rel['prop'] = cnt_and_rel['count']/1184
c_c_a_2r = _plot_tag_scatter(cnt_and_rel, 19, 'prop', 'Cult Films',
                             'Proportion of Movies with Tag', 'cnt_rel_cult.png')

#relevant oscar tags and plot
# Built directly from mov_tags so this section does not rely on a separately
# named groupby object being defined.
oscar_tag_groups = mov_tags[mov_tags['isoscar'] == True].groupby('tag')
oscar_cnt_and_rel = _tag_relevance_stats(oscar_tag_groups)
# NOTE(review): 470 is hard-coded; presumably the number of Oscar films — confirm.
oscar_cnt_and_rel['prop'] = oscar_cnt_and_rel['count']/470
o_c_a_2r = _plot_tag_scatter(oscar_cnt_and_rel, 23, 'prop', 'Oscar Films',
                             'Proportion of Movies with Tag', 'cnt_rel_osc.png')

#cult and oscars combined and plot
both = mov_tags[(mov_tags['isoscar'] == True) & (mov_tags['iscult'] == True)]
gp_both = both.groupby(both.tag)
gp_mov_tags = mov_tags.groupby(mov_tags.tag)
g_c_a_r = _tag_relevance_stats(gp_both)
g_c_a_2r = _plot_tag_scatter(g_c_a_r, 6, 'count', 'Cult and Oscar Films',
                             'Count of Movies with Tag', 'both.png')

#all movies
m_c_a_r = _tag_relevance_stats(gp_mov_tags)
# NOTE(review): 62423 is hard-coded; presumably the total number of movies in the
# dataset, while `movies` has been filtered by rating count — confirm the
# intended denominator.
m_c_a_r['prop'] = m_c_a_r['count']/62423
m_c_a_2r = _plot_tag_scatter(m_c_a_r, 9, 'prop', 'All Films',
                             'Proportion of Movies with Tag', 'cnt_rel_all.png')






