In [1]:
import pandas as pd
import numpy as np

# Creating dataframe

In [None]:
df=pd.read_csv('data.tsv', sep='\t')
print(df.shape)
print(df.head(100))

# Cleaning titles

In [4]:
df['titleType'].unique()

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

In [5]:
df=df[~df["titleType"].str.contains('short')]
df=df[~df["titleType"].str.contains('tvEpisode')]
df=df[~df["titleType"].str.contains('tvSpecial')]
df=df[~df["titleType"].str.contains('video')]
df=df[~df["titleType"].str.contains('videoGame')]
df=df[~df["titleType"].str.contains('tvPilot')]

In [6]:
df['titleType'].unique()

array(['movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvMiniSeries'],
      dtype=object)

# Cleaning ratings

In [None]:
df_ratings=pd.read_csv('dataratings.tsv', sep='\t', header=0, quoting=3)
print(df_ratings.shape)
print(df_ratings.head(100))

In [12]:
df=df.drop(columns=['isAdult','endYear', 'originalTitle'])

In [13]:
df=pd.merge(df, df_ratings, on='tconst')

In [19]:
df=df.dropna(subset=['startYear', 'runtimeMinutes', 'genres', 'primaryTitle'])

# Splitting genres

In [None]:
genre_dummies = df['genres'].str.get_dummies(sep=',')

In [33]:
df = pd.concat([df, genre_dummies], axis=1)

In [35]:
df.drop(columns=['genres'], inplace=True)

In [36]:
len(df[df['numVotes']<100])

205231

# Deleting rows where number of votes are less than 100

In [37]:
df = df[df['numVotes'] >= 100]

# API Calls from TMDb

In [6]:
import requests
import pandas as pd
import warnings

# To ignore all warnings (not recommended)
warnings.filterwarnings("ignore")

# Define your API_KEY securely
API_KEY = 'fa9272e4589b7ec38b742c278e16a2f0'

# Initialize lists to store data before updating the DataFrame

values=pd.DataFrame()

for index, row in df.iterrows():
    id = row['tconst']
    query = f'https://api.themoviedb.org/3/find/{id}?api_key={API_KEY}&language=en-US&external_source=imdb_id'

    try:
        response = requests.get(query)
        response.raise_for_status()  # Raise an exception for HTTP errors

        movie = response.json()
        if 'tv_results' in movie and len(movie['tv_results']) > 0:
            result = movie['tv_results'][0]
        elif 'movie_results' in movie and len(movie['movie_results']) > 0:
            result = movie['movie_results'][0]
        else:
            # Handle the case where there are no results
            result = None
        
        if result:
            overview = result.get('overview', '')
            tmdb_vote_avg = result.get('vote_average', 0)
            poster_path = result.get('poster_path', '')

            overview = overview[:400]  # Limit overview text to 400 characters (adjust as needed)

            values = values.append({"overview":overview, "tmdbVoteAvg": tmdb_vote_avg, "poster": poster_path}, ignore_index=True)
            #overviews.append(overview)
            #tmdb_vote_avgs.append(tmdb_vote_avg)
            #posters.append(f"https://www.themoviedb.org/t/p/w600_and_h900_bestv2/{poster_path}")
        else:
            # Handle the case where there are no valid results
            values = values.append({"overview":'', "tmdbVoteAvg": np.nan, "poster": ''}, ignore_index=True)

    except requests.exceptions.RequestException as e:
        print(f"Error making API request for {id}: {e}")

df = pd.concat([df, values], axis=1)


# Delete rows with empty overview, average tmdb vote, and poster

In [17]:
df=df[df['overview']!='']

In [34]:
df=df[df['tmdbVoteAvg']!=0]

Unnamed: 0,tconst,titleType,primaryTitle,startYear,runtimeMinutes,averageRating,numVotes,Action,Adult,Adventure,...,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western\r,overview,tmdbVoteAvg,poster


In [36]:
df=df[~df['poster'].isna()]

# Export dataframe

In [43]:
df.isnull().any(axis=1).any()

False

In [45]:
df.to_csv('base.tsv', sep="\t", index=False)

In [46]:
df.shape

(138763, 38)

In [None]:
df.to_csv('Lili.tsv', sep="\t", index=False)