In [3]:
import pandas as pd
import os
import numpy as np
import requests
import json

In [83]:
# Read the first raw ratings file and keep only rows 1,000,000 through 1,999,999
# so the chunk-handling logic below can be prototyped on a mid-file slice.
df = pd.read_csv(
    '../src/combined_data_1.txt',
    header=None,
    names=['customer_id', 'rating', 'date'],
).iloc[1000000:2000000, :]

In [84]:
chunk = df.copy()

In [85]:
chunk

Unnamed: 0,customer_id,rating,date
1000000,953667,4.0,2005-02-27
1000001,95084,2.0,2005-02-27
1000002,2024723,1.0,2005-02-28
1000003,386823,4.0,2005-03-01
1000004,999201,2.0,2005-03-03
...,...,...,...
1999995,719099,3.0,2005-12-04
1999996,2600755,5.0,2005-12-11
1999997,1143715,4.0,2005-12-18
1999998,555073,3.0,2005-12-18


In [86]:
chunk.reset_index(drop=True, inplace=True)

In [40]:
# Cast customer_id to str so the ":"-marked movie-header rows can be detected
# with string methods.
chunk["customer_id"] = chunk["customer_id"].astype(str)
# Index labels of every row whose first field contains ":" (a movie header).
movie_id_rows = chunk.loc[chunk["customer_id"].str.contains(":")].index

In [41]:
# If chunk starts with a customer rating row, use the previous movie id.
# The leading rows (0 up to, but excluding, the first header row) belong to the
# movie that was still open at the end of the previous chunk.
# NOTE(review): `prev_movie_id` is not assigned in any cell visible above this
# one, so on a fresh kernel this branch raises NameError whenever the chunk
# does not start with a header row — define it before running this cell.
if len(movie_id_rows) > 0 and movie_id_rows[0] != 0:
    movie_ids = [(prev_movie_id, (0, movie_id_rows[0]))]
else:
    movie_ids = []

In [42]:
# Pair each movie header (except the last) with the half-open row range of its
# ratings: from the row after the header up to the next header.
for header_row, next_header_row in zip(movie_id_rows[:-1], movie_id_rows[1:]):
    # Header text looks like "<id>:", so drop the final character.
    movie_id = chunk.at[header_row, "customer_id"][:-1]
    idx_range = (header_row + 1, next_header_row)
    movie_ids.append((movie_id, idx_range))

In [43]:
# Add last movie id and its index range.
# The last header owns every row from the one after it through the end of the
# chunk; `[:-1]` strips the final character of the header text.
movie_id = chunk.at[movie_id_rows[-1], "customer_id"][:-1]
idx_range = (movie_id_rows[-1] + 1, len(chunk))
movie_ids.append((movie_id, idx_range))

# Store the last movie id for the next chunk: ratings at the start of the
# following chunk still belong to this movie until a new header appears.
next_movie_id = movie_id


In [45]:
# Build one frame per movie: slice that movie's rating rows out of the chunk
# and tag them with the movie id, then stack everything into a single frame.
data = [
    chunk.iloc[start:end].assign(movie_id=movie_id)
    for movie_id, (start, end) in movie_ids
]

processed_chunk = pd.concat(data, ignore_index=True)

In [46]:
processed_chunk

Unnamed: 0,customer_id,rating,date,movie_id
0,1488844,3.0,2005-09-06,1
1,822109,5.0,2005-05-13,1
2,885013,4.0,2005-10-19,1
3,30878,4.0,2005-12-26,1
4,823519,3.0,2004-05-03,1
...,...,...,...,...
999770,1196927,3.0,2005-02-15,225
999771,528854,5.0,2005-02-21,225
999772,962705,3.0,2005-02-22,225
999773,1299323,2.0,2005-02-24,225


In [47]:
next_movie_id

'225'

In [110]:
def process_chunk(chunk, prev_movie_id):
    """Attach a ``movie_id`` column to one chunk of the raw ratings file.

    A row whose ``customer_id`` field contains ":" is treated as a movie
    header; the rows that follow it, up to the next header, are that movie's
    ratings. Rows that come before the first header of the chunk belong to
    ``prev_movie_id``, the movie still open at the end of the previous chunk.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Frame with a ``customer_id`` column (plus whatever other columns the
        caller read). The caller's frame is not modified.
    prev_movie_id : str, int or None
        Movie id carried over from the previous chunk, or None when there is
        no previous chunk.

    Returns
    -------
    tuple
        ``(processed_chunk, next_movie_id)``: the rating rows (header rows
        removed, ``customer_id`` as str) with an integer ``movie_id`` column,
        and the id of the last movie seen, to pass to the next call.

    Raises
    ------
    ValueError
        If rating rows precede the first header and ``prev_movie_id`` is None.
    """
    # Work on a re-indexed copy so positions and labels coincide and the
    # caller's chunk is left untouched.
    chunk = chunk.reset_index(drop=True)
    chunk["customer_id"] = chunk["customer_id"].astype(str)

    # Positions of the movie header rows.
    is_header = chunk["customer_id"].str.contains(":", regex=False)
    header_rows = chunk.index[is_header.to_numpy()].tolist()

    segments = []  # (movie_id, start, end) with end exclusive
    next_movie_id = prev_movie_id

    # Rows before the first header (or the whole chunk when it has no header)
    # continue the previous movie. The end bound is exclusive, so use
    # len(chunk), not len(chunk) - 1, to keep the final row.
    leading_end = header_rows[0] if header_rows else len(chunk)
    if leading_end > 0:
        if prev_movie_id is None:
            raise ValueError(
                "chunk starts with rating rows but no previous movie id was given"
            )
        segments.append((prev_movie_id, 0, leading_end))

    # Each header owns the rows up to the next header; the last header owns
    # the rows up to the end of the chunk.
    segment_ends = header_rows[1:] + [len(chunk)]
    for header_row, segment_end in zip(header_rows, segment_ends):
        next_movie_id = chunk.at[header_row, "customer_id"].replace(":", "").strip()
        segments.append((next_movie_id, header_row + 1, segment_end))

    # Create a dataframe with movie ids, customer ids, ratings, and dates
    frames = []
    for movie_id, start, end in segments:
        customer_ratings = chunk.iloc[start:end].copy()
        customer_ratings["movie_id"] = int(movie_id)
        frames.append(customer_ratings)

    if frames:
        processed_chunk = pd.concat(frames, ignore_index=True)
    else:
        # Empty chunk with nothing carried over: return an empty, typed frame.
        processed_chunk = chunk.iloc[0:0].copy()
        processed_chunk["movie_id"] = pd.Series(dtype="int64")
    return processed_chunk, next_movie_id

def filter_data(df):
    """Drop sparsely rated movies and low-activity customers, then save the result.

    Rating counts are computed per ``movie_id`` and per ``customer_id`` on the
    full input. The 70th percentile of each count distribution (rounded) is
    the minimum; movies and customers below their minimum are removed. The
    trimmed frame is written to ``filtered_data.csv`` in the working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``movie_id``, ``customer_id`` and ``rating`` columns.

    Returns
    -------
    None
    """
    agg_functions = ['count','mean']

    # The summary indexes keep the dtype of the grouped column so that the
    # `isin` checks below compare like with like (e.g. string customer ids).
    df_movie_summary = df.groupby('movie_id')['rating'].agg(agg_functions)
    movie_benchmark = round(df_movie_summary['count'].quantile(0.7),0)
    drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index
    print('Movie minimum times of review: {}'.format(movie_benchmark))

    df_cust_summary = df.groupby('customer_id')['rating'].agg(agg_functions)
    cust_benchmark = round(df_cust_summary['count'].quantile(0.7),0)
    drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index
    print('Customer minimum times of review: {}'.format(cust_benchmark))

    print('Original Shape: {}'.format(df.shape))
    df = df[~df['movie_id'].isin(drop_movie_list)]
    df = df[~df['customer_id'].isin(drop_cust_list)]
    print('After Trim Shape: {}'.format(df.shape))
    df.to_csv("filtered_data.csv",index= False)

def process_files(chunksize:int = 1000000, drop_date:bool = False, filter_data:bool = False):
    """Convert the four raw ``combined_data_*.txt`` files into one tidy CSV.

    Each file is read from ``../src`` (relative to the working directory) in
    chunks of ``chunksize`` rows; every chunk goes through ``process_chunk``
    and the results are stacked and written to ``processed_data.csv``.

    Parameters
    ----------
    chunksize : int
        Number of rows read per chunk.
    drop_date : bool
        When true, the ``date`` column is removed from the output.
    filter_data : bool
        When true, the module-level ``filter_data`` function is called on the
        combined frame before it is saved.

    Returns
    -------
    None
    """
    # The boolean parameter `filter_data` hides the function of the same name
    # inside this scope, so fetch the function from the module namespace.
    # NOTE(review): this expects the one-argument `filter_data(df)` variant to
    # be the one currently defined; a zero-argument redefinition will fail here.
    filter_function = globals().get("filter_data")

    data_files = ["combined_data_1.txt", "combined_data_2.txt", "combined_data_3.txt", "combined_data_4.txt"]
    # abspath of a bare file name resolves against the working directory, so
    # parent_dir is the directory the notebook is run from
    parent_dir = os.path.dirname(os.path.abspath("preprocess_test.ipynb"))
    # construct the path to the src directory
    src_dir = os.path.join(parent_dir, "..", "src")
    # construct the path to the files
    data_files = [os.path.join(src_dir, file) for file in data_files]

    # Collect the processed chunks and concatenate once at the end; growing a
    # frame with concat on every iteration re-copies all earlier rows each time.
    processed_chunks = []
    prev_movie_id = None
    for data_file in data_files:
        for chunk in pd.read_csv(data_file, chunksize=chunksize, header=None, names=["customer_id", "rating", "date"]):
            processed_chunk, prev_movie_id = process_chunk(chunk, prev_movie_id)
            if drop_date:
                processed_chunk = processed_chunk.drop(columns=["date"])
            processed_chunks.append(processed_chunk)

    if processed_chunks:
        master_df = pd.concat(processed_chunks, ignore_index=True)
    else:
        master_df = pd.DataFrame()

    if filter_data:
        filter_function(master_df)

    # Save the processed data
    master_df.to_csv("processed_data.csv", index=False)

In [106]:
process_files(1000000)

In [107]:
df = pd.read_csv('processed_data.csv')

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100480507 entries, 0 to 100480506
Data columns (total 4 columns):
 #   Column       Dtype  
---  ------       -----  
 0   customer_id  int64  
 1   rating       float64
 2   date         object 
 3   movie_id     int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 3.0+ GB


In [109]:
df.head()

Unnamed: 0,customer_id,rating,date,movie_id
0,1488844,3.0,2005-09-06,1
1,822109,5.0,2005-05-13,1
2,885013,4.0,2005-10-19,1
3,30878,4.0,2005-12-26,1
4,823519,3.0,2004-05-03,1


In [111]:
df['date'] = pd.to_datetime(df['date'])

In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100480507 entries, 0 to 100480506
Data columns (total 4 columns):
 #   Column       Dtype         
---  ------       -----         
 0   customer_id  int64         
 1   rating       float64       
 2   date         datetime64[ns]
 3   movie_id     int64         
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 3.0 GB


In [113]:
def parse_df(file):
    """Parse one raw ratings file into a single DataFrame.

    The file text is split on ":". The first piece is the id of the first
    movie; each later piece holds one movie's rating lines followed (except
    for the last piece) by the next movie's id on its final line. Every piece
    is turned into a frame by ``create_dataframe_from_movie``.

    NOTE(review): movie ids are assumed to increase by exactly one from one
    section to the next; the id printed in the file is not re-read.

    Parameters
    ----------
    file : str
        Path of the raw ratings file.

    Returns
    -------
    pandas.DataFrame
        All movies' ratings stacked together (empty if the file has no
        sections).
    """
    with open(file, "r") as handle:
        text_split = handle.read().split(":")

    # First element = only a number: the id of the first movie in the file.
    index_film = int(text_split[0])
    sections = text_split[1:]
    last_position = len(sections) - 1

    frames = []
    for position, section in enumerate(sections):
        if position == last_position:
            # The last section holds the final movie's ratings and has no
            # trailing "next id" line. Skipping it would lose that movie.
            if not section.strip():
                continue
            # create_dataframe_from_movie always discards the last line, so
            # make sure that line is empty rather than a real rating.
            if not section.endswith("\n"):
                section = section + "\n"
        frames.append(create_dataframe_from_movie(section, index_film))
        index_film += 1

    # One concat at the end instead of re-copying the growing frame per movie.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
                
        
def create_dataframe_from_movie(extract,index_movie):
    """Build the ratings frame for one movie from its slice of the raw file.

    Parameters
    ----------
    extract : str
        Newline-separated ``customer,rating,date`` lines for one movie. Its
        last line is always discarded (in the raw file it carries the next
        movie's id).
    index_movie : int
        Id stored in the ``Movie_Id`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Cust_Id``, ``Rating`` and ``Movie_Id``.
    """
    from io import StringIO

    # List columns
    columns_df = ["Cust_Id","Rating","Timestamp"]
    # Eliminating last number
    extract = "\n".join(extract.split('\n')[:-1])
    # Parse straight from memory: no shared temp file on disk to write,
    # re-read, leave behind, or collide on between concurrent runs.
    df = pd.read_csv(StringIO(extract),lineterminator='\n',sep = ",",header = None)
    df.columns = columns_df
    df["Movie_Id"] = int(index_movie)
    df = df.drop(columns="Timestamp")
    return df

def concat_data():
    """Parse the four raw ratings files and write them combined to ``data.csv``.

    Files are read from ``../src`` relative to the working directory, each is
    parsed with ``parse_df``, and the stacked result is saved without the
    index column.

    Returns
    -------
    None
    """
    data_files = ["combined_data_1.txt", "combined_data_2.txt", "combined_data_3.txt", "combined_data_4.txt"]
    # abspath of a bare file name resolves against the working directory
    parent_dir = os.path.dirname(os.path.abspath("preprocess_test.ipynb"))
    # construct the path to the src directory
    src_dir = os.path.join(parent_dir, "..", "src")
    # construct the path to the files
    list_files = [os.path.join(src_dir, file) for file in data_files]

    # Parse every file first, then concatenate and write once; writing inside
    # the loop rewrote the whole growing CSV after each input file.
    parts = [parse_df(file) for file in list_files]
    df_all = pd.concat(parts)
    df_all.to_csv("data.csv",index=False)

def filter_data():
    """Trim ``data_all.csv`` to well-rated movies and active customers.

    For movies and for customers, the minimum rating count is the rounded
    70th percentile of the per-group counts. Rows whose movie or customer is
    below its minimum are removed and the rest is saved to ``filtering.csv``.

    NOTE(review): this reads ``data_all.csv`` while ``concat_data`` in this
    notebook writes ``data.csv`` — confirm which file name is intended.
    """
    ratings = pd.read_csv("data_all.csv")
    stats = ['count','mean']

    # Per-movie rating statistics and the movies that fall below the cut-off.
    per_movie = ratings.groupby('Movie_Id')['Rating'].agg(stats)
    per_movie.index = per_movie.index.map(int)
    movie_benchmark = round(per_movie['count'].quantile(0.7),0)
    sparse_movies = per_movie.loc[per_movie['count'] < movie_benchmark].index
    print('Movie minimum times of review: {}'.format(movie_benchmark))

    # Same computation per customer, still on the untrimmed data.
    per_customer = ratings.groupby('Cust_Id')['Rating'].agg(stats)
    per_customer.index = per_customer.index.map(int)
    cust_benchmark = round(per_customer['count'].quantile(0.7),0)
    print(cust_benchmark)
    inactive_customers = per_customer.loc[per_customer['count'] < cust_benchmark].index
    print('Customer minimum times of review: {}'.format(cust_benchmark))

    print('Original Shape: {}'.format(ratings.shape))
    # A row survives only if neither its movie nor its customer is listed.
    rejected = ratings['Movie_Id'].isin(sparse_movies) | ratings['Cust_Id'].isin(inactive_customers)
    trimmed = ratings[~rejected]
    print('After Trim Shape: {}'.format(trimmed.shape))
    trimmed.to_csv("filtering.csv",index= False)
    

In [None]:
# Benchmarking (this function took 30 minutes to run while the first took 5 minutes)
concat_data()

In [116]:
def filter_data():
    """Trim ``processed_data.csv`` to well-rated movies and active customers.

    The rounded 70th percentile of the rating counts per movie, and separately
    per customer, is used as the minimum count. Rows belonging to a movie or a
    customer below its minimum are discarded and the remainder is written to
    ``filtered_data.csv``.
    """
    def _count_summary(frame, key):
        # count/mean of `rating` per group, with the group labels cast to int
        summary = frame.groupby(key)['rating'].agg(['count','mean'])
        summary.index = summary.index.map(int)
        return summary

    ratings = pd.read_csv("processed_data.csv")

    movie_summary = _count_summary(ratings, 'movie_id')
    movie_benchmark = round(movie_summary['count'].quantile(0.7),0)
    drop_movie_list = movie_summary.loc[movie_summary['count'] < movie_benchmark].index
    print('Movie minimum times of review: {}'.format(movie_benchmark))

    # Customer counts are taken on the full data, before any movie is removed.
    cust_summary = _count_summary(ratings, 'customer_id')
    cust_benchmark = round(cust_summary['count'].quantile(0.7),0)
    print(cust_benchmark)
    drop_cust_list = cust_summary.loc[cust_summary['count'] < cust_benchmark].index
    print('Customer minimum times of review: {}'.format(cust_benchmark))

    print('Original Shape: {}'.format(ratings.shape))
    keep_movie = ~ratings['movie_id'].isin(drop_movie_list)
    keep_customer = ~ratings['customer_id'].isin(drop_cust_list)
    kept = ratings[keep_movie & keep_customer]
    print('After Trim Shape: {}'.format(kept.shape))
    kept.to_csv("filtered_data.csv",index= False)

In [117]:
filter_data()

Movie minimum times of review: 1948.0
211.0
Customer minimum times of review: 211.0
Original Shape: (100480507, 3)
After Trim Shape: (71833509, 3)


In [2]:
# Read the filtered ratings. The CSV already carries its own header row
# (customer_id, rating, movie_id), so let pandas use it: forcing
# names=[..., 'date'] with header=None mislabels the movie_id column and
# parses the header line as a data row.
df = pd.read_csv("https://ns-recommendation-engine-bucket.s3.eu-west-3.amazonaws.com/processed_data/filtered_data.csv")

  df = pd.read_csv("https://ns-recommendation-engine-bucket.s3.eu-west-3.amazonaws.com/processed_data/filtered_data.csv", names=['customer_id','rating','date'], header=None)


In [11]:
df

Unnamed: 0,customer_id,rating,movie_id
0,712664,5.0,3
1,1331154,4.0,3
2,2632461,3.0,3
3,44937,5.0,3
4,656399,4.0,3
...,...,...,...
71833504,1428223,4.0,17769
71833505,483107,4.0,17769
71833506,77664,2.0,17769
71833507,1922916,3.0,17769


In [2]:
url = 'https://jedha-netflix-real-time-api.herokuapp.com/users-currently-watching-movie'
response = requests.get(url).json()

In [3]:
response

'{"columns":["YearRelease","Name","current_time","customerID"],"index":[2110,9236,1401,2104,16440,3366,14589,6333,4919,3631],"data":[[1979.0,"Alien Quadrilogy: Bonus Material",1683922843994,179166],[1998.0,"South Park: Season 2",1683922843994,179166],[1992.0,"Passion Fish",1683922843994,179166],[1989.0,"Kickboxer",1683922843994,179166],[2000.0,"It Had to Be You",1683922843994,179166],[1945.0,"Film Noir Collection: Scarlet Street",1683922843994,179166],[1961.0,"Top Cat: The Complete Series",1683922843994,179166],[1987.0,"Hiding Out",1683922843994,179166],[1997.0,"South Park: Passion of the Jew",1683922843994,179166],[1993.0,"Nemesis",1683922843994,179166]]}'

In [4]:
type(response)

str

In [5]:

res_json = json.loads(response)

In [6]:
res_json

{'columns': ['YearRelease', 'Name', 'current_time', 'customerID'],
 'index': [2110, 9236, 1401, 2104, 16440, 3366, 14589, 6333, 4919, 3631],
 'data': [[1979.0, 'Alien Quadrilogy: Bonus Material', 1683922843994, 179166],
  [1998.0, 'South Park: Season 2', 1683922843994, 179166],
  [1992.0, 'Passion Fish', 1683922843994, 179166],
  [1989.0, 'Kickboxer', 1683922843994, 179166],
  [2000.0, 'It Had to Be You', 1683922843994, 179166],
  [1945.0, 'Film Noir Collection: Scarlet Street', 1683922843994, 179166],
  [1961.0, 'Top Cat: The Complete Series', 1683922843994, 179166],
  [1987.0, 'Hiding Out', 1683922843994, 179166],
  [1997.0, 'South Park: Passion of the Jew', 1683922843994, 179166],
  [1993.0, 'Nemesis', 1683922843994, 179166]]}

In [11]:
type(res_json)

dict

In [15]:
res_json['data']

[[1979.0, 'Alien Quadrilogy: Bonus Material', 1683922843994, 179166],
 [1998.0, 'South Park: Season 2', 1683922843994, 179166],
 [1992.0, 'Passion Fish', 1683922843994, 179166],
 [1989.0, 'Kickboxer', 1683922843994, 179166],
 [2000.0, 'It Had to Be You', 1683922843994, 179166],
 [1945.0, 'Film Noir Collection: Scarlet Street', 1683922843994, 179166],
 [1961.0, 'Top Cat: The Complete Series', 1683922843994, 179166],
 [1987.0, 'Hiding Out', 1683922843994, 179166],
 [1997.0, 'South Park: Passion of the Jew', 1683922843994, 179166],
 [1993.0, 'Nemesis', 1683922843994, 179166]]

In [16]:
pd.DataFrame(res_json['data'])

Unnamed: 0,0,1,2,3
0,1979.0,Alien Quadrilogy: Bonus Material,1683922843994,179166
1,1998.0,South Park: Season 2,1683922843994,179166
2,1992.0,Passion Fish,1683922843994,179166
3,1989.0,Kickboxer,1683922843994,179166
4,2000.0,It Had to Be You,1683922843994,179166
5,1945.0,Film Noir Collection: Scarlet Street,1683922843994,179166
6,1961.0,Top Cat: The Complete Series,1683922843994,179166
7,1987.0,Hiding Out,1683922843994,179166
8,1997.0,South Park: Passion of the Jew,1683922843994,179166
9,1993.0,Nemesis,1683922843994,179166


In [7]:
headers = {'Accept': 'application/json'}

response = requests.get('https://jedha-netflix-real-time-api.herokuapp.com/users-currently-watching-movie', headers=headers)

# Now you can use the response. For example, to get the JSON data from the response:
data = response.json()


In [8]:
response

<Response [200]>

In [9]:
data

'{"columns":["YearRelease","Name","current_time","customerID"],"index":[10630,15223,11320,10791,9258,9049,12533,12543,8773,6628],"data":[[1980.0,"Tess",1683923058277,2496697],[2001.0,"Osmosis Jones",1683923058277,2496697],[1992.0,"Home Improvement: Season 2",1683923058277,2496697],[2001.0,"Teletubbies: Baby Animals",1683923058277,2496697],[1938.0,"Laurel & Hardy & Friends",1683923058277,2496697],[1997.0,"Boogie Nights",1683923058277,2496697],[1994.0,"Country Life",1683923058277,2496697],[2000.0,"Time Code",1683923058277,2496697],[1993.0,"Shadowlands",1683923058277,2496697],[2004.0,"Arna\'s Children",1683923058277,2496697]]}'

In [10]:
type(data)

str

In [17]:
my_json = {"columns":["YearRelease","Name","current_time","customerID"],"index":[1814,9499,1631,14744,6093,4597,15945,6433,8087,4690],"data":[[1998.0,"Dance with Me",1683924405448,252398],[1981.0,"Ken Burns' America: Brooklyn Bridge",1683924405448,252398],[1970.0,"On a Clear Day You Can See Forever",1683924405448,252398],[1989.0,"The Phantom of the Opera",1683924405448,252398],[1995.0,"House of Cards Trilogy III: The Final Cut",1683924405448,252398],[1965.0,"Buster Keaton Rides Again\/The Railrodder",1683924405448,252398],[1996.0,"Space Jam",1683924405448,252398],[2003.0,"Barney's Outdoor Fun!",1683924405448,252398],[2004.0,"Origins: Nova",1683924405448,252398],[1969.0,"Mackenna's Gold",1683924405448,252398]]}

In [19]:
df = pd.DataFrame(my_json['data'], columns=my_json['columns'])

In [23]:
df=pd.read_json(response.json(), orient='split')

In [24]:
df

Unnamed: 0,YearRelease,Name,current_time,customerID
10630,1980,Tess,2023-05-12 20:24:18.277,2496697
15223,2001,Osmosis Jones,2023-05-12 20:24:18.277,2496697
11320,1992,Home Improvement: Season 2,2023-05-12 20:24:18.277,2496697
10791,2001,Teletubbies: Baby Animals,2023-05-12 20:24:18.277,2496697
9258,1938,Laurel & Hardy & Friends,2023-05-12 20:24:18.277,2496697
9049,1997,Boogie Nights,2023-05-12 20:24:18.277,2496697
12533,1994,Country Life,2023-05-12 20:24:18.277,2496697
12543,2000,Time Code,2023-05-12 20:24:18.277,2496697
8773,1993,Shadowlands,2023-05-12 20:24:18.277,2496697
6628,2004,Arna's Children,2023-05-12 20:24:18.277,2496697


In [8]:
def api_to_dataframe():
    """Fetch the movies a user is currently watching and return them as a DataFrame.

    Sends a GET request to the real-time API, whose JSON body is itself a
    JSON-encoded string in pandas' "split" orientation. The frame's index is
    moved into a regular column named ``Movie_Id``.

    Returns
    -------
    pandas.DataFrame
        One row per movie, with ``Movie_Id`` followed by the columns sent by
        the API.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    from io import StringIO

    url = 'https://jedha-netflix-real-time-api.herokuapp.com/users-currently-watching-movie'
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    payload = response.json()
    # The API double-encodes its body (json() yields a str); tolerate an
    # already-decoded object as well.
    if not isinstance(payload, str):
        payload = json.dumps(payload)
    # Wrap the text in a file-like object: passing a literal JSON string to
    # read_json is deprecated in recent pandas versions.
    df = pd.read_json(StringIO(payload), orient='split').reset_index().rename(columns={'index':'Movie_Id'})
    return df

In [9]:
df = api_to_dataframe()

In [94]:
df

Unnamed: 0,index,YearRelease,Name,current_time,customerID
0,1502,1999,Superstar,2023-05-13 20:03:14.250,97714
1,9351,1967,Belle de Jour,2023-05-13 20:03:14.250,97714
2,1428,2003,The Recruit,2023-05-13 20:03:14.250,97714
3,4105,2002,"Ranma 1/2: Ranma Forever: Vol. 8: Someday, Som...",2023-05-13 20:03:14.250,97714
4,12094,2000,Pokemon: The Advanced Master's Guide,2023-05-13 20:03:14.250,97714
5,3767,1999,Ambush,2023-05-13 20:03:14.250,97714
6,6991,2001,A History of God,2023-05-13 20:03:14.250,97714
7,2038,1957,Jean Renoir: Elena and Her Men,2023-05-13 20:03:14.250,97714
8,6002,1996,Norma Jean & Marilyn,2023-05-13 20:03:14.250,97714
9,9691,2001,ECW: The Best of Cactus Jack,2023-05-13 20:03:14.250,97714


In [95]:
df["customerID"][0]

97714

In [96]:
# Serialise the frame to a JSON string.
# NOTE(review): this rebinds the builtin name `dict` to a string for the rest
# of the session; later cells read this variable, so rename it together with
# those uses (e.g. to `df_json`).
dict = df.to_json()

In [97]:
dict

'{"index":{"0":1502,"1":9351,"2":1428,"3":4105,"4":12094,"5":3767,"6":6991,"7":2038,"8":6002,"9":9691},"YearRelease":{"0":1999,"1":1967,"2":2003,"3":2002,"4":2000,"5":1999,"6":2001,"7":1957,"8":1996,"9":2001},"Name":{"0":"Superstar","1":"Belle de Jour","2":"The Recruit","3":"Ranma 1\\/2: Ranma Forever: Vol. 8: Someday, Somehow","4":"Pokemon: The Advanced Master\'s Guide","5":"Ambush","6":"A History of God","7":"Jean Renoir: Elena and Her Men","8":"Norma Jean & Marilyn","9":"ECW: The Best of Cactus Jack"},"current_time":{"0":1684008194250,"1":1684008194250,"2":1684008194250,"3":1684008194250,"4":1684008194250,"5":1684008194250,"6":1684008194250,"7":1684008194250,"8":1684008194250,"9":1684008194250},"customerID":{"0":97714,"1":97714,"2":97714,"3":97714,"4":97714,"5":97714,"6":97714,"7":97714,"8":97714,"9":97714}}'

In [99]:
df = pd.read_json(dict)

In [100]:
df

Unnamed: 0,index,YearRelease,Name,current_time,customerID
0,1502,1999,Superstar,2023-05-13 20:03:14.250,97714
1,9351,1967,Belle de Jour,2023-05-13 20:03:14.250,97714
2,1428,2003,The Recruit,2023-05-13 20:03:14.250,97714
3,4105,2002,"Ranma 1/2: Ranma Forever: Vol. 8: Someday, Som...",2023-05-13 20:03:14.250,97714
4,12094,2000,Pokemon: The Advanced Master's Guide,2023-05-13 20:03:14.250,97714
5,3767,1999,Ambush,2023-05-13 20:03:14.250,97714
6,6991,2001,A History of God,2023-05-13 20:03:14.250,97714
7,2038,1957,Jean Renoir: Elena and Her Men,2023-05-13 20:03:14.250,97714
8,6002,1996,Norma Jean & Marilyn,2023-05-13 20:03:14.250,97714
9,9691,2001,ECW: The Best of Cactus Jack,2023-05-13 20:03:14.250,97714


In [101]:
df.rename(columns={'index':'Movie_Id'}, inplace=True)

In [102]:
df

Unnamed: 0,Movie_Id,YearRelease,Name,current_time,customerID
0,1502,1999,Superstar,2023-05-13 20:03:14.250,97714
1,9351,1967,Belle de Jour,2023-05-13 20:03:14.250,97714
2,1428,2003,The Recruit,2023-05-13 20:03:14.250,97714
3,4105,2002,"Ranma 1/2: Ranma Forever: Vol. 8: Someday, Som...",2023-05-13 20:03:14.250,97714
4,12094,2000,Pokemon: The Advanced Master's Guide,2023-05-13 20:03:14.250,97714
5,3767,1999,Ambush,2023-05-13 20:03:14.250,97714
6,6991,2001,A History of God,2023-05-13 20:03:14.250,97714
7,2038,1957,Jean Renoir: Elena and Her Men,2023-05-13 20:03:14.250,97714
8,6002,1996,Norma Jean & Marilyn,2023-05-13 20:03:14.250,97714
9,9691,2001,ECW: The Best of Cactus Jack,2023-05-13 20:03:14.250,97714


In [107]:
df['Estimate_score'] = 4.0

In [109]:
df

Unnamed: 0,Movie_Id,YearRelease,Name,current_time,customerID,Estimate_score
0,1502,1999,Superstar,2023-05-13 20:03:14.250,97714,4.0
1,9351,1967,Belle de Jour,2023-05-13 20:03:14.250,97714,4.0
2,1428,2003,The Recruit,2023-05-13 20:03:14.250,97714,4.0
3,4105,2002,"Ranma 1/2: Ranma Forever: Vol. 8: Someday, Som...",2023-05-13 20:03:14.250,97714,4.0
4,12094,2000,Pokemon: The Advanced Master's Guide,2023-05-13 20:03:14.250,97714,4.0
5,3767,1999,Ambush,2023-05-13 20:03:14.250,97714,4.0
6,6991,2001,A History of God,2023-05-13 20:03:14.250,97714,4.0
7,2038,1957,Jean Renoir: Elena and Her Men,2023-05-13 20:03:14.250,97714,4.0
8,6002,1996,Norma Jean & Marilyn,2023-05-13 20:03:14.250,97714,4.0
9,9691,2001,ECW: The Best of Cactus Jack,2023-05-13 20:03:14.250,97714,4.0


In [110]:
df = df[['customerID','Movie_Id','YearRelease','Name','current_time','Estimate_score']]
df = df.rename(columns=str.lower).sort_values('estimate_score', ascending=False).head(10)

In [112]:
df

Unnamed: 0,customerid,movie_id,yearrelease,name,current_time,estimate_score
0,97714,1502,1999,Superstar,2023-05-13 20:03:14.250,4.0
1,97714,9351,1967,Belle de Jour,2023-05-13 20:03:14.250,4.0
2,97714,1428,2003,The Recruit,2023-05-13 20:03:14.250,4.0
3,97714,4105,2002,"Ranma 1/2: Ranma Forever: Vol. 8: Someday, Som...",2023-05-13 20:03:14.250,4.0
4,97714,12094,2000,Pokemon: The Advanced Master's Guide,2023-05-13 20:03:14.250,4.0
5,97714,3767,1999,Ambush,2023-05-13 20:03:14.250,4.0
6,97714,6991,2001,A History of God,2023-05-13 20:03:14.250,4.0
7,97714,2038,1957,Jean Renoir: Elena and Her Men,2023-05-13 20:03:14.250,4.0
8,97714,6002,1996,Norma Jean & Marilyn,2023-05-13 20:03:14.250,4.0
9,97714,9691,2001,ECW: The Best of Cactus Jack,2023-05-13 20:03:14.250,4.0


In [127]:
pivot = df.head(5).pivot(index='customerid', columns='movie_id', values='name')

In [128]:
pivot

movie_id,1428,1502,4105,9351,12094
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
97714,The Recruit,Superstar,"Ranma 1/2: Ranma Forever: Vol. 8: Someday, Som...",Belle de Jour,Pokemon: The Advanced Master's Guide


In [129]:
pivot.columns

Int64Index([1428, 1502, 4105, 9351, 12094], dtype='int64', name='movie_id')

In [103]:
user_id = df["customerID"][0]
movie_id = df["Movie_Id"][0]

In [104]:
prediction = requests.get(f'https://chris-netflix-api.herokuapp.com/get-model?user_id={user_id}&movie_id={movie_id}').json()

In [106]:
type(prediction)

float

In [67]:
value = json.dumps(dict)

In [68]:
value

'"{\\"YearRelease\\":{\\"0\\":1949,\\"1\\":1996,\\"2\\":1999,\\"3\\":2003,\\"4\\":1989,\\"5\\":1967,\\"6\\":2004,\\"7\\":1996,\\"8\\":1996,\\"9\\":2003},\\"Name\\":{\\"0\\":\\"On the Town\\",\\"1\\":\\"Battle Athletes: Vol. 1: On Your Mark\\",\\"2\\":\\"Lovers Lane\\",\\"3\\":\\"Danny Deckchair\\",\\"4\\":\\"Millennium\\",\\"5\\":\\"Star Trek: The Original Series: Vols. 16-28\\",\\"6\\":\\"The Hollow\\",\\"7\\":\\"Everybody Loves Raymond: Season 1\\",\\"8\\":\\"Phat Beach\\",\\"9\\":\\"Rivers and Tides\\"},\\"current_time\\":{\\"0\\":1684004334973,\\"1\\":1684004334973,\\"2\\":1684004334973,\\"3\\":1684004334973,\\"4\\":1684004334973,\\"5\\":1684004334973,\\"6\\":1684004334973,\\"7\\":1684004334973,\\"8\\":1684004334973,\\"9\\":1684004334973},\\"customerID\\":{\\"0\\":1473699,\\"1\\":1473699,\\"2\\":1473699,\\"3\\":1473699,\\"4\\":1473699,\\"5\\":1473699,\\"6\\":1473699,\\"7\\":1473699,\\"8\\":1473699,\\"9\\":1473699}}"'

In [74]:
record_value = json.dumps(
    {
        "degrees_in_celsion": np.random.randint(10, 40)
    }
)

In [75]:
record_value

'{"degrees_in_celsion": 14}'

In [88]:
df_title = pd.read_csv('https://netflix-project-bucket.s3.eu-west-3.amazonaws.com/data/movie_titles.csv', encoding = "ISO-8859-1", header = None, names = ['Movie_Id', 'Year', 'Name'], on_bad_lines='skip').set_index('Movie_Id')

In [90]:
df_title

Unnamed: 0_level_0,Year,Name
Movie_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW
...,...,...
17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17767,2004.0,Fidel Castro: American Experience
17768,2000.0,Epoch
17769,2003.0,The Company


In [None]:
# Send data to database
# NOTE(review): psycopg2, boto3, time, db, user_pred, AWS_KEY_ID and
# AWS_KEY_SECRET are not defined in any visible cell of this notebook; they
# must be imported/defined before this cell can run.
# The database password is read from the environment instead of being stored
# in the notebook.
conn = psycopg2.connect(
    host="rogue.db.elephantsql.com",
    database="wfkunnps",
    user="wfkunnps",
    password=os.environ["NETFLIX_DB_PASSWORD"]
)
try:
    with conn.cursor() as cur:
        # Placeholders let the driver quote the values; interpolating them
        # into the SQL text breaks on strings and allows SQL injection.
        query = ('INSERT INTO netflix_prediction ("User_ID", "Movie1", "Movie2", "Movie3", "Movie4", "Movie5") '
                 'VALUES (%s, %s, %s, %s, %s, %s);')
        values = [db["User_Id"][0]] + [db["data"][i] for i in range(5)]
        # Convert numpy scalars to plain Python values for the driver.
        values = [value.item() if hasattr(value, "item") else value for value in values]
        cur.execute(query, values)
    # Without an explicit commit the INSERT is rolled back on close.
    conn.commit()
finally:
    conn.close()
# Send data to datalake
user_pred.to_csv('last.csv', index=False)
aws_access_key_id = AWS_KEY_ID
aws_secret_access_key = AWS_KEY_SECRET
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
with open("last.csv", "rb") as f:
    # Upload the file to S3
    s3.upload_fileobj(f, "netflix-recommandation", "last_recommandation/last.csv")
time.sleep(0.01)
print("success")

In [130]:
df = pd.read_csv("https://ns-recommendation-engine-bucket.s3.eu-west-3.amazonaws.com/processed_data/filtered_data.csv")

In [131]:
df = df.sample(frac=0.1, random_state=42)

In [132]:
df

Unnamed: 0,customer_id,rating,movie_id
48426088,995497,3.0,12293
4393806,1510193,3.0,1180
39159530,455314,5.0,9909
64338356,2106681,4.0,15968
60324622,2578541,4.0,15043
...,...,...,...
5010483,250871,4.0,1406
10695988,841190,3.0,2862
56440,1007172,4.0,28
48739064,263291,3.0,12336


In [2]:
# Get title-id mapping for all movies
df_title = pd.read_csv('https://netflix-project-bucket.s3.eu-west-3.amazonaws.com/data/movie_titles.csv',
                       encoding = "ISO-8859-1",
                        header = None, names = ['Movie_Id', 'Year', 'Name'], 
                        on_bad_lines='skip').set_index('Movie_Id')


In [5]:
user_pred = df_title.copy().sample(100)
user_pred = user_pred.reset_index()

In [6]:
user_pred

Unnamed: 0,Movie_Id,Year,Name
0,4207,1980.0,The Blues Brothers: Extended Cut
1,14322,1987.0,Scooby-Doo Meets the Boo Brothers
2,2028,1991.0,Poison
3,477,1990.0,George Carlin: Personal Favorites
4,15399,2002.0,100 Mile Rule
...,...,...,...
95,8842,1999.0,Walking with Dinosaurs: Bonus Material
96,10234,1994.0,Trial by Jury
97,11597,1994.0,Love and a .45
98,4328,1972.0,The Ruling Class


In [7]:
def predict(user_id,movie_id):
    """Ask the model API for the predicted rating of one movie by one user.

    Parameters
    ----------
    user_id : int
        Customer id sent as the ``user_id`` query parameter.
    movie_id : int
        Movie id sent as the ``movie_id`` query parameter.

    Returns
    -------
    The decoded JSON body of the response (a float in the recorded runs of
    this notebook).

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    # Let requests build and encode the query string, and bound the wait so a
    # stalled server cannot block a whole batch of predictions.
    response = requests.get(
        'https://chris-netflix-api.herokuapp.com/get-model',
        params={'user_id': user_id, 'movie_id': movie_id},
        timeout=10,
    )
    response.raise_for_status()
    prediction = response.json()
    return prediction

In [10]:
record_value_df = df

In [11]:
record_value_df

Unnamed: 0,Movie_Id,YearRelease,Name,current_time,customerID
0,4238,2000,Inu-Yasha,2023-05-15 19:12:19.624,48271
1,12199,1998,Secrets of War: Nazi Warfare,2023-05-15 19:12:19.624,48271
2,14023,1952,The Road to Bali,2023-05-15 19:12:19.624,48271
3,6574,1992,Single White Female,2023-05-15 19:12:19.624,48271
4,13933,1998,Dream for an Insomniac,2023-05-15 19:12:19.624,48271
5,12607,1945,Flame of Barbary Coast,2023-05-15 19:12:19.624,48271
6,9549,2003,The Collected Shorts of Jan Svankmajer: Vol. 1,2023-05-15 19:12:19.624,48271
7,14024,1999,Sorcerer on the Rocks,2023-05-15 19:12:19.624,48271
8,10797,1936,The Three Stooges: Cops and Robbers,2023-05-15 19:12:19.624,48271
9,13788,2003,Jennifer Lopez: Let's Get Loud,2023-05-15 19:12:19.624,48271


In [14]:
record_value_df['Movie_Id'].tolist()

[4238, 12199, 14023, 6574, 13933, 12607, 9549, 14024, 10797, 13788]

In [15]:
# process latest movies the user is currently watching
user_curr_watching = record_value_df.copy()
# get user id (first row, by position)
user_id = user_curr_watching['customerID'].iloc[0]
# time of the request: a single value shared by every prediction row
request_time = user_curr_watching['current_time'].iloc[0]
# get list of movies the user is currently watching
watched_movies = user_curr_watching['Movie_Id'].tolist()
# Predict user rating for a sample of movies
user_pred = df_title.copy().sample(50)
user_pred = user_pred.reset_index()
# Remove the movies the user is currently watching from the list of movies to predict
# (.copy() so the column assignments below act on an independent frame)
user_pred = user_pred[~user_pred['Movie_Id'].isin(watched_movies)].copy()
# Predict user rating for each movie using the predict function
user_pred['Estimate_Score'] = user_pred['Movie_Id'].apply(lambda x: predict(user_id,x))
# Add current time to the prediction. Assign the scalar: assigning the Series
# aligns on the index and leaves NaT on every row past the watched rows.
user_pred["current_time"] = request_time
# Rename columns
user_pred = user_pred.rename(columns={'Year':'YearRelease'})
# Add user id to the prediction dataframe
user_pred['customerID'] = user_id
# Reorganize columns order
user_pred = user_pred[['customerID','Movie_Id','YearRelease','Name','current_time','Estimate_Score']]
# Lower case column names
user_pred = user_pred.rename(columns=str.lower).sort_values('estimate_score', ascending=False)
# Renaming current_time because it created a conflict with the database
user_pred = user_pred.rename(columns={'current_time':'request_time'})
# Get top 5 movies, best score first. Build the single row directly from the
# sorted names: pivot() orders its columns by movie_id, which would detach
# the top1..top5 labels from the score ranking.
top_names = user_pred['name'].head(5).tolist()
top_movies = pd.DataFrame([top_names], columns=[f'top{i+1}' for i in range(len(top_names))])
top_movies.insert(0, 'customerid', user_id)

In [11]:
# Look up the poster URL for a title on OMDb.
# The API key is read from the OMDB_API_KEY environment variable so that it is
# not stored in the notebook; requests encodes the query parameters.
url = 'https://www.omdbapi.com/'
response = requests.get(url, params={'apikey': os.environ['OMDB_API_KEY'], 't': 'blade'}, timeout=10).json()['Poster']


In [12]:
response

'https://m.media-amazon.com/images/M/MV5BOTk2NDNjZWQtMGY0Mi00YTY2LWE5MzctMGRhZmNlYzljYTg5XkEyXkFqcGdeQXVyMTAyNjg4NjE0._V1_SX300.jpg'