In [3]:
import requests, os, shutil
from tqdm.notebook import tqdm

In [5]:
if 'ml-32.zip' not in os.listdir('./') and 'Dataset' not in os.listdir('./'):
    with requests.get('https://files.grouplens.org/datasets/movielens/ml-32m.zip', stream = True) as response:
        total_size = int(response.headers.get('content-length', 0))
        with tqdm(total = total_size, desc = 'Downloading', unit_scale = True, unit='iB') as progress_bar:
            with open('./ml-32.zip', 'wb') as file:
                for chunk in response.iter_content(chunk_size = 1024):
                    if chunk:
                        file.write(chunk)
                        progress_bar.update(len(chunk))
else:
    print("File already downloaded")

File already downloaded


In [6]:
if 'Dataset' not in os.listdir('./'):
    shutil.unpack_archive('./ml-32.zip', './Dataset')
    os.remove('./ml-32.zip')
print("File unpacked")

File unpacked


In [4]:
import pandas as pd, numpy as np, requests
import plotly.express as px
import plotly.figure_factory as ff
import threading
from datetime import datetime
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor
from matplotlib import pyplot as plt
import re
tqdm.pandas()

In [46]:
files = os.listdir('./Dataset/ml-32m')

In [48]:
files = tuple(filter(lambda file: file.endswith('.csv'), files))

In [50]:
files

('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv')

In [236]:
dfList = {}
for file in tqdm(files, desc='Loading data'):
    dfList[file.replace('.csv', '')] = pd.read_csv(f'./Dataset/ml-32m/{file}')

Loading data:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
dfList['links'].head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


In [11]:
dfList['movies'].head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [12]:
dfList['ratings'].head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228


In [16]:
dfList['tags'].head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297


In [21]:
for key in dfList:
    print(f'{key}:\n{dfList[key].isna().sum()}')

links:
movieId      0
imdbId       0
tmdbId     124
dtype: int64
movies:
movieId    0
title      0
genres     0
dtype: int64
ratings:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
tags:
userId        0
movieId       0
tag          17
timestamp     0
dtype: int64


In [22]:
def getImdbID(tmdbID):
    url = f"https://api.themoviedb.org/3/movie/{tmdbID}/external_ids"
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiI2MDljNGEyYTJlMzA3M2M3ODFlMGQ3ZGFmNTRjNDk0OCIsIm5iZiI6MTczMzk0NDAyMC40LCJzdWIiOiI2NzU5ZTJkNGZmYmMwZmNjM2Q0NGMyNWQiLCJzY29wZXMiOlsiYXBpX3JlYWQiXSwidmVyc2lvbiI6MX0.W0VK7A5XTNlPxwACwYKTfVDP2raNT_Dhe6dL-07qo94"
    }

    response = requests.get(url, headers=headers)
    if response.ok:
        return response.json()['imdb_id']
    else:
        return np.nan

In [23]:
def getTmdbID(imdbID):
    url = f"https://api.themoviedb.org/3/movie/{imdbID}/external_ids"
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiI2MDljNGEyYTJlMzA3M2M3ODFlMGQ3ZGFmNTRjNDk0OCIsIm5iZiI6MTczMzk0NDAyMC40LCJzdWIiOiI2NzU5ZTJkNGZmYmMwZmNjM2Q0NGMyNWQiLCJzY29wZXMiOlsiYXBpX3JlYWQiXSwidmVyc2lvbiI6MX0.W0VK7A5XTNlPxwACwYKTfVDP2raNT_Dhe6dL-07qo94"
    }

    response = requests.get(url, headers=headers)
    if response.ok:
        return response.json()['id']
    else:
        return np.nan

In [237]:
dfLinks = dfList['links'].copy()
dfMovies = dfList['movies'].copy()
dfRatings = dfList['ratings'].copy()
dfTags = dfList['tags'].copy()
del dfList

In [25]:
indices = dfLinks[dfLinks['tmdbId'].isna()].index

In [26]:
def fill_tmdb_id(row):
    if pd.isna(row['tmdbId']):
        row['tmdbId'] = getTmdbID(row['imdbId'])  # Assign value if tmdbId is NaN
    return row  # Always return the full row

In [27]:
dfLinks = dfLinks.progress_apply(fill_tmdb_id, axis=1)

  0%|          | 0/87585 [00:00<?, ?it/s]

In [28]:
dfLinks.isna().sum()

movieId     0
imdbId      0
tmdbId     75
dtype: int64

In [120]:
def extractGenre(df):
    genre = []
    for x in df['genres']:
        temp = x.split('|')
        for i in temp:
            if i.strip() not in genre:
                genre.append(i.strip())
    return genre

In [122]:
genre = extractGenre(dfMovies)

In [124]:
dfMovies[genre] = 0

In [126]:
def split(row):
    temp = row['genres'].split('|')
    for i in temp:
        row[i.strip()] = 1
    return row

In [128]:
dfMovies = dfMovies.progress_apply(split, axis = 1)

  0%|          | 0/86471 [00:00<?, ?it/s]

In [130]:
dfMovies.head(2)

Unnamed: 0,movieId,title,original_language,popularity,runtime,release_date,poster_path,weightedVoteAverage,Adventure,Animation,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),en,114.879,81.0,1995-10-30,https://image.tmdb.org/t/p/w500/uXDfjJbdP4ijW5...,7.997425,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),en,14.291,104.0,1995-12-15,https://image.tmdb.org/t/p/w500/vgpXmVaVyUL7GG...,7.238083,1,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
dfRatings.rating.unique()

array([4. , 1. , 2. , 5. , 3. , 3.5, 0.5, 4.5, 2.5, 1.5])

In [42]:
# dfRatings['timestamp'] = pd.to_datetime(dfRatings['timestamp'], unit='s')

In [43]:
dfRatings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228


In [44]:
# dfTags['timestamp'] = pd.to_datetime(dfTags['timestamp'], unit='s')

In [240]:
dfTags.dropna(inplace = True, ignore_index=True)

In [446]:
def extractTag(df):
    tag = []
    for x in df['tag']:
        temp = x.split(' ')
        for i in temp:
            if i.strip() not in tag:
                tag.append(i.strip())
    return tag

In [448]:
# extractTag(dfTags)

In [450]:
def moreMovieDetails(imdbId, tmdbId):
    url = f"https://api.themoviedb.org/3/movie/{'tt'+str(imdbId)}?language=en-US"
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiI2MDljNGEyYTJlMzA3M2M3ODFlMGQ3ZGFmNTRjNDk0OCIsIm5iZiI6MTczMzk0NDAyMC40LCJzdWIiOiI2NzU5ZTJkNGZmYmMwZmNjM2Q0NGMyNWQiLCJzY29wZXMiOlsiYXBpX3JlYWQiXSwidmVyc2lvbiI6MX0.W0VK7A5XTNlPxwACwYKTfVDP2raNT_Dhe6dL-07qo94"
    }
    
    response = requests.get(url, headers=headers)
    if not response.ok:
        url = f"https://api.themoviedb.org/3/movie/{str(tmdbId)}?language=en-US"
        response = requests.get(url, headers=headers)

    if response.ok:
        return pd.Series({'original_language': response.json()['original_language'] if 'original_language' in response.json() else np.nan,
                              'vote_average': response.json()['vote_average'] if 'vote_average' in response.json() else np.nan,
                              'vote_count': response.json()['vote_count'] if 'vote_count' in response.json() else np.nan,
                              'popularity': response.json()['popularity'] if 'popularity' in response.json() else np.nan,
                              'runtime': response.json()['runtime'] if 'runtime' in response.json() else np.nan,
                              'release_date': response.json()['release_date'] if 'release_date' in response.json() else np.nan,
                              'poster_path': 'https://image.tmdb.org/t/p/w500' + response.json()['poster_path'] if response.json()['poster_path'] else np.nan})
    else:
        return pd.Series({'original_language': np.nan,
                          'vote_average': np.nan,
                          'vote_count': np.nan,
                          'popularity': np.nan,
                          'runtime': np.nan,
                          'release_date': np.nan,
                          'poster_path': np.nan})

In [452]:

def process_chunk(chunk):
    """Process a chunk of the dataframe."""
    return chunk.progress_apply(lambda x: moreMovieDetails(x['imdbId'], x['tmdbId']), axis=1)

def process_in_background():
    """Function to process the DataFrame in the background."""
    # Number of threads to use
    num_threads = 12

    # Split the DataFrame into chunks for each thread
    chunks = np.array_split(dfLinks, num_threads)

    # Use ThreadPoolExecutor to process chunks in parallel
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(executor.map(process_chunk, chunks))

    # Combine the results back into a single DataFrame
    global temp
    temp = pd.concat(results)
    print('Finished')

# Start the background thread
background_thread = threading.Thread(target=process_in_background)
background_thread.start()


'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.



In [454]:
dfLinks = pd.concat([dfLinks, temp], axis = 1)

In [179]:
dfLinks.shape

(87585, 10)

In [183]:
dfLinks[dfLinks.drop(['movieId', 'imdbId', 'tmdbId'], axis = 1).isna().any(axis = 1)]

Unnamed: 0,movieId,imdbId,tmdbId,original_language,vote_average,vote_count,popularity,runtime,release_date,poster_path
137,139.0,114618.0,124639.0,en,0.0,0.0,0.423,122.0,1995-08-01,
399,404.0,109339.0,316098.0,en,3.0,2.0,2.544,115.0,1995-03-01,
596,604.0,115978.0,538286.0,,,,,,,
684,699.0,103095.0,277270.0,en,9.0,2.0,1.177,0.0,1991-01-01,
706,721.0,114103.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
87386,291499.0,12836100.0,655817.0,en,0.0,0.0,0.934,6.0,2019-11-26,
87408,291699.0,152800.0,68925.0,en,6.6,8.0,0.655,12.0,1998-01-01,
87437,291791.0,17592600.0,1136321.0,,,,,,,
87520,292143.0,6216336.0,426654.0,en,5.0,2.0,3.749,86.0,2016-11-05,


In [221]:
movieIds = dfLinks[dfLinks.drop(['movieId', 'imdbId', 'tmdbId'], axis = 1).isna().all(axis = 1)].movieId

In [223]:
len(movieIds)/dfLinks.shape[0]*100

1.2719072900610835

In [227]:
dfLinks.drop(dfLinks[dfLinks['movieId'].isin(movieIds)].index, inplace = True)
dfLinks.reset_index(drop = True, inplace = True)

In [241]:
dfTags.drop(dfTags[dfTags['movieId'].isin(movieIds)].index, inplace = True)
dfTags.reset_index(drop = True, inplace = True)

In [263]:
dfRatings.drop(dfRatings[dfRatings['movieId'].isin(movieIds)].index, inplace = True)
dfRatings.reset_index(drop = True, inplace = True)

In [267]:
dfMovies.drop(dfMovies[dfMovies['movieId'].isin(movieIds)].index, inplace = True)
dfMovies.reset_index(drop = True, inplace = True)

In [271]:
dfLinks.drop(['imdbId', 'tmdbId'], inplace = True, axis = 1)

In [54]:
# dfLinks.to_pickle('Links.pkl')
# dfTags.to_pickle('Tags.pkl')
# dfRatings.to_pickle('Ratings.pkl')
# dfMovies.to_pickle('Movies.pkl')

In [170]:
dfLinks = pd.read_pickle('Links.pkl')
dfTags = pd.read_pickle('Tags.pkl')
dfRatings = pd.read_pickle('Ratings.pkl')
dfMovies = pd.read_pickle('Movies.pkl')

In [30]:
columns = dfMovies.drop(['movieId', 'title', 'original_language', 'popularity', 'runtime', 'release_date', 'poster_path', 'weightedVoteAverage'], axis = 1).columns

In [32]:
def func(row):
    text = ''
    for col in columns:
        if row[col]:
            text+= col+'|'
    return text[0:-1]

In [176]:
fig = px.bar(dfMovies.drop(['movieId', 'title'], axis = 1,).sum(),color_discrete_sequence=['#bf0000'], title="Movies Genre", template = 'plotly_dark', color=dfMovies.drop(['movieId', 'title'], axis = 1).columns).update_layout(showlegend=False, xaxis={'categoryorder': 'total descending'}, xaxis_title="Genres", yaxis_title="Movies count")

In [178]:
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
    plot_bgcolor='rgba(0,0,0,0)',   # Transparent plot background
)
pio.write_image(fig, 'Movie.png', scale = 20)

In [34]:
# dfMovies['genres'] = dfMovies.apply(func, axis = 1)

In [80]:
dfMovies = dfMovies.merge(dfLinks)

In [154]:
fig = ff.create_distplot([dfRatings['rating']], ['rating'], show_hist=False, show_rug=False)

In [160]:
for i, trace in enumerate(fig.data):
    trace.line.color = '#bf0000'

In [164]:
fig.update_layout(
    template='plotly_dark',
    title="Distribution Plot of ratings",
    xaxis_title="Rating",
    yaxis_title="Density",
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
    plot_bgcolor='rgba(0,0,0,0)',   # Transparent plot background
)
pio.write_image(fig, 'Dist.png', scale = 20)

In [22]:
oldStamp = -1
interactionFreq = list()
for stamp in dfRatings['timestamp'].sort_values():
    if oldStamp != -1:
        interactionFreq.append(stamp - oldStamp)
    oldStamp = stamp
np.mean(interactionFreq)

28.429175850942766

In [182]:
dfRatings['dateTime'] = pd.to_datetime(dfRatings.timestamp, unit='s')

In [183]:
dfRatings['year'] = dfRatings['dateTime'].dt.year
dfRatings['month'] = dfRatings['dateTime'].dt.month
dfRatings['dayName'] = dfRatings['dateTime'].dt.day_name()
dfRatings['day'] = dfRatings['dateTime'].dt.day

In [184]:
temp = dfRatings.groupby(by=['year', 'month', 'day'], sort=['year', 'month', 'day'])['rating'].mean().to_frame().reset_index()

In [185]:
temp['date'] = pd.to_datetime((temp['year'].astype(str) + '-' + temp['month'].astype(str) + '-' + temp['day'].astype(str)))

In [208]:
fig = px.line(temp, x = 'date', y = 'rating', color_discrete_sequence=['#bf0000'], template = 'plotly_dark', title = 'Average rating of movies across days')

In [274]:
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
    plot_bgcolor='rgba(0,0,0,0)',   # Transparent plot background
)
pio.write_image(fig, 'Lang.png', scale = 20)

In [272]:
fig = px.histogram(dfMovies, x = 'original_language', template = 'plotly_dark',color_discrete_sequence=["#bf0000"], title = 'Number of Movies per language').update_layout(xaxis={'categoryorder': 'total descending'}, xaxis_title="Original language", yaxis_title="Number of movies")

In [82]:
dfMovies.describe()

Unnamed: 0,movieId,vote_average,vote_count,popularity,runtime
count,86471.0,86471.0,86471.0,86471.0,86471.0
mean,157515.38872,5.845355,256.439454,6.223589,91.243596
std,79242.213105,1.450591,1220.106975,10.587636,53.508834
min,1.0,0.0,0.0,0.0,0.0
25%,112289.0,5.207,8.0,1.743,83.0
50%,165633.0,6.062,22.0,3.507,93.0
75%,213314.0,6.753,78.0,7.136,105.0
max,292757.0,10.0,36638.0,871.82,12480.0


In [84]:
m = dfMovies['vote_count'].quantile(0.50)
c = dfMovies['vote_average'].mean()

In [86]:
weightedVote = lambda row: (row.vote_count/(row.vote_count+m))*row.vote_average + (m/(row.vote_count+m))*c

In [88]:
dfMovies['weightedVoteAverage'] = dfMovies.progress_apply(weightedVote, axis = 1)

  0%|          | 0/86471 [00:00<?, ?it/s]

In [90]:
dfMovies.drop(['vote_average', 'vote_count'], axis = 'columns', inplace = True)

In [168]:
dfRatings.sort_values(['dateTime'], )

Unnamed: 0,userId,movieId,rating,timestamp,weightedRating,dateTime,year,month,dayName,day
3984668,25062,1176,4.0,789652004,0.000000,1995-01-09 11:46:44,1995,1,Monday,9
4945606,30917,21,3.0,789652009,0.000000,1995-01-09 11:46:49,1995,1,Monday,9
4945612,30917,47,5.0,789652009,0.000000,1995-01-09 11:46:49,1995,1,Monday,9
4945655,30917,1079,3.0,789652009,0.000000,1995-01-09 11:46:49,1995,1,Monday,9
6205690,38835,17,5.0,822873600,0.000000,1996-01-29 00:00:00,1996,1,Monday,29
...,...,...,...,...,...,...,...,...,...,...
8642459,54315,112818,3.0,1697163923,2.999222,2023-10-13 02:25:23,2023,10,Friday,13
8642619,54315,197709,3.5,1697163970,3.499283,2023-10-13 02:26:10,2023,10,Friday,13
8642613,54315,194947,2.5,1697164060,2.499748,2023-10-13 02:27:40,2023,10,Friday,13
8642436,54315,104457,2.5,1697164098,2.499858,2023-10-13 02:28:18,2023,10,Friday,13


In [8]:
mx = dfRatings.timestamp.max()
divisor = 60*60*24
dfRatings['weightedRating'] = dfRatings[['rating', 'timestamp']].progress_apply(lambda row: row.rating * (np.exp(-0.1*(mx-row.timestamp)/divisor)), axis = 1)

  0%|          | 0/31921860 [00:00<?, ?it/s]

In [252]:
fig = px.histogram(dfRatings[dfRatings['movieId'] == 1],color_discrete_sequence=["#bf0000"], x = dfRatings[dfRatings['movieId'] == 1].month.astype(str), y = 'rating', histfunc='avg', template = 'plotly_dark', title = f'Average rating of {dfMovies[dfMovies['movieId'] == 1].title.iloc[0]}').update_layout(xaxis_title='Months', xaxis={'categoryorder':'array', 'categoryarray': [1,2,3,4,5,6,7,8,9,10,11,12]})

In [256]:
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',  # Transparent background
    plot_bgcolor='rgba(0,0,0,0)',   # Transparent plot background
)
pio.write_image(fig, 'story.png', scale = 20)

In [246]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [198]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [242]:
dfTags['tag'] = dfTags.tag.str.lower()

In [244]:
dfTags['tag'].sort_values().tolist()

['    the asylum',
 ' alex roe',
 ' alexander skarsgård',
 ' andre dussollier',
 ' andre dussollier',
 ' antebellum south',
 ' antebellum south',
 ' beauty parlor',
 ' beauty parlor',
 ' breakup',
 ' comedy ',
 ' criminal psychology',
 ' denis ménochet',
 ' difficult to find it',
 ' disney ripoff for cash',
 ' disney ripoff for cash',
 ' disney ripoff for cash',
 ' disney ripoff for cash',
 ' ecological dystopian thriller film',
 ' filmed in chile',
 ' filmes antigos',
 ' filmes antigos',
 ' filmes antigos ',
 ' filmes antigos ',
 ' fluorescent bulb',
 ' human behavior',
 ' jean rochefort',
 ' jill ireland',
 ' julien bertheau',
 ' kartik aaryan',
 ' kriti sanon',
 ' lambert wilson',
 ' laurel canyon',
 ' lauren lapkus',
 ' lauren lapkus',
 ' lgbtiqa+',
 ' masami nagasawa',
 ' mental illness',
 ' mental illness',
 ' mental illness',
 ' mental illness',
 ' michael lonsdale',
 ' music',
 ' n 2022 with family',
 ' nathalie baye',
 ' nostalgia done right',
 " o'shea jackson jr.",
 ' pair '

In [245]:
dfTags['tag'] = dfTags['tag'].str.replace(r"[^\w\s]", '', regex=True)

In [246]:
dfTags.drop(dfTags['tag'][dfTags['tag'].str.len() == 0].index, inplace = True)

In [247]:
dfTags['tag'] = dfTags['tag'].str.strip()

In [252]:
dfTags.drop(dfTags[dfTags['tag'].str.isdigit()].index, inplace = True)

In [254]:
dfTags['tag'].sort_values().unique().tolist()

['',
 '0 gravity',
 '0 stars',
 '007 series',
 '007like',
 '06 oscar nominated best movie',
 '06 oscar nominated best movie  animation',
 '06 oscar nominated best movie  foreign language',
 '1 million bc',
 '1 month later',
 '1 prediction',
 '1 year later',
 '10 downing street',
 '10 downing street london',
 '10 minutes of spinning crap',
 '10 year old',
 '10 year old boy',
 '10 year old girl',
 '10 years later',
 '100 acre wood',
 '100 brain power',
 '100 essential female performances',
 '100 greatest movies',
 '100 minutes',
 '100 predictable',
 '100 years',
 '1000 mile adventure',
 '1000 year old',
 '10000 bc',
 '1000000 prize',
 '1000th rating',
 '1001 night tales',
 '1001 nights',
 '1001 year old',
 '1001th birthday',
 '100th century bc',
 '101st airborne',
 '103 year old',
 '108 year old',
 '10s',
 '10th century',
 '11 oscars',
 '11 year old',
 '11 year old boy',
 '11 year old explicit violence',
 '11 year old girl',
 '115 gev',
 '1150s',
 '1160s',
 '1170s',
 '1180s',
 '1190s',
 

In [255]:
stop_words = stopwords.words()
dfTags['tag'] = dfTags['tag'].progress_apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

  0%|          | 0/1996082 [00:00<?, ?it/s]

In [257]:
dfTags = dfTags[dfTags['tag'].str.contains(r'[a-zA-Z]', regex=True, na=False)]

In [300]:
dfTags['tag'].sort_values().unique().tolist()

['0 gravity',
 '0 star',
 '007 series',
 '007like',
 '06 oscar nominated movie',
 '06 oscar nominated movie animation',
 '06 oscar nominated movie foreign language',
 '1 cobra helicopter',
 '1 hr 28 minute',
 '1 million',
 '1 month',
 '1 prediction',
 '1 year',
 '10 downing street',
 '10 downing street london',
 '10 minute spinning crap',
 '10 minute unwatchable',
 '10 movie made',
 '10 thunderbolt aircraft',
 '10 year',
 '10 year boy',
 '10 year girl',
 '100 acre wood',
 '100 brain power',
 '100 essential female performance',
 '100 greatest movie',
 '100 minute',
 '100 predictable',
 '100 time bore',
 '100 year',
 '1000 mile adventure',
 '1000 year',
 '1000000 prize',
 '1000th rating',
 '1001 night',
 '1001 night tale',
 '1001 year',
 '1001th birthday',
 '100th century',
 '101st airborne',
 '103 year',
 '108 year',
 '10th century',
 '11 oscar',
 '11 year',
 '11 year boy',
 '11 year explicit violence',
 '11 year girl',
 '115 gev',
 '1150s',
 '1160s',
 '1170s',
 '1180s',
 '119 flying bo

In [262]:
lemmatizer = WordNetLemmatizer()

In [264]:
dfTags['tag'] = dfTags['tag'].progress_apply(lambda tag: ' '.join([lemmatizer.lemmatize(word) for word in tag.split()]))

  0%|          | 0/1978955 [00:00<?, ?it/s]

In [270]:
dfTags['tag'] = dfTags['tag'].str.replace('_', ' ').str.strip()

In [None]:
dfTags.drop(dfTags[dfTags['tag'].str.isdigit()].index, inplace = True)

In [298]:
dfTags['tag'] = dfTags['tag'].progress_apply(lambda x: " ".join(dict.fromkeys(x.split())))

  0%|          | 0/1978030 [00:00<?, ?it/s]

In [356]:
rareTags = dfTags['tag'].value_counts(ascending = True)[dfTags['tag'].value_counts(ascending = True) <= 2].index

In [360]:
dfTags.drop(dfTags[dfTags['tag'].isin(rareTags)].index, inplace = True)

In [362]:
dfTags['tag'].sort_values().unique().tolist()

['0 gravity',
 '007 series',
 '06 oscar nominated movie',
 '06 oscar nominated movie animation',
 '1 prediction',
 '1 year',
 '10 minute unwatchable',
 '10 movie made',
 '10 year',
 '100 essential female performance',
 '100 greatest movie',
 '1001 night',
 '10th century',
 '11 year',
 '11 year explicit violence',
 '11 year girl',
 '1191aspect ratio',
 '11th century',
 '12 step program',
 '12 year',
 '12 year boy',
 '12 year girl',
 '12 year making',
 '12th century',
 '13 year',
 '13 year boy',
 '13 year girl',
 '130 hercules',
 '1300s',
 '1331 aspect ratio',
 '13th century',
 '14 year',
 '14 year boy',
 '14 year girl',
 '1400s',
 '14th century',
 '15 hour',
 '15 year',
 '15 year girl',
 '1500s',
 '1540s',
 '15th century',
 '16 year',
 '16 year girl',
 '1600s',
 '1610s',
 '1640s',
 '1680s',
 '16mm',
 '16th birthday',
 '16th century',
 '17 year',
 '17 year girl',
 '1700s',
 '1770s',
 '1780s',
 '1790s',
 '17th century',
 '18 wheeler',
 '18 year',
 '18 year girl',
 '1800s',
 '1810s',
 '182

In [8]:
dfTags['tag'].nunique()

41302

In [72]:
dfTags[dfTags['tag'] == 'scientific'].index

Unnamed: 0,userId,movieId,tag,timestamp
1154,303,134130,scientific,1551554994
15243,2508,134130,scientific,1621000747
16505,2818,134130,scientific,1627646451
21286,3393,134130,scientific,1561842827
37325,5643,134130,scientific,1434464365
...,...,...,...,...
1939406,156214,134130,scientific,1523815269
1983818,159555,134130,scientific,1454884853
1991779,160536,134130,scientific,1450841373
1993695,160876,134130,scientific,1606573593


In [68]:
dfMovies.columns = dfMovies.columns[:-1].tolist() + ['Unknown']

In [11]:
dfLinks.to_pickle('LinksProcessed.pkl')
dfTags.to_pickle('TagsProcessed.pkl')
dfRatings.to_pickle('RatingsProcessed.pkl')
dfMovies.to_pickle('MoviesProcessed.pkl')