In [1]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import numpy as np
import joblib



def gener(data):
    no_genre = data[pd.isnull(data['genres'])]
    i = data[pd.isnull(data['genres'])].index
    listgenre = ['Science Fiction & Fantasy','Drama','Animation','Animation','Animation','Action & Adventure','Musical & Performing Arts','Romance','Action & Adventure','Drama','Comedy','Animation','Action & Adventure','Horror','Action & Adventure','Mystery & Suspense','Science Fiction & Fantasy','Documentary','Animation']

    for g in range(0,len(no_genre)):
        data.loc[i[g],'genres'] = listgenre[g]
        continue

    df = data['genres'].str.get_dummies(',')
    data = pd.concat([data,df],axis=1)
    return data
def encoding(data):
    drop_column = ["movie_title","rotten_tomatoes_link","movie_info","original_release_date","streaming_release_date","critics_consensus","genres","directors","authors","actors","production_company"]
    data.drop(drop_column,axis=1,inplace=True)

    cont_col = list(data.describe())
    cat_col = list(c for c in data.columns if c not in cont_col)

    cont_data = data.loc[:, cont_col]
    cat_data = data.loc[:, cat_col]

    imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
    cont_data = imputer.fit_transform(cont_data)

    imputer2 = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
    cat_data = imputer2.fit_transform(cat_data)

    cat_data = pd.DataFrame(cat_data)
    cat_data.columns = cat_col
    cont_data = pd.DataFrame(cont_data)
    cont_data.columns = cont_col

    oe = OrdinalEncoder(categories=[['Rotten','Fresh','Certified-Fresh']])
    cat_data['tomatometer_status'] = oe.fit_transform(cat_data[['tomatometer_status']])

    data = pd.concat([cat_data,cont_data],axis=1)
    column_name = 'tomatometer_rating'

    column = data.pop(column_name)

    data[column_name] = column
    return data

transformer_gener = FunctionTransformer(gener)
transformer_encoding = FunctionTransformer(encoding)

pipeline = make_pipeline(
    transformer_gener,
    transformer_encoding
)

data = pd.read_csv('/kaggle/input/rotten-tomatoes-movies-and-critic-reviews-dataset/rotten_tomatoes_movies.csv')

data = pipeline.transform(data)
joblib_filename = 'processed_data_pipeline.joblib'

joblib.dump(pipeline, joblib_filename)
print(f"Data Pipeline has been saved as {joblib_filename}")

Data Pipeline has been saved as processed_data_pipeline.joblib


In [2]:
data

Unnamed: 0,content_rating,tomatometer_status,audience_status,runtime,tomatometer_count,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,...,Horror,Kids & Family,Musical & Performing Arts,Mystery & Suspense,Romance,Science Fiction & Fantasy,Special Interest,Television,Western,tomatometer_rating
0,PG,0.0,Spilled,119.0,149.0,53.0,254421.0,43.0,73.0,76.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0
1,R,2.0,Upright,90.0,142.0,64.0,11574.0,44.0,123.0,19.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,87.0
2,R,1.0,Spilled,122.0,24.0,53.0,14684.0,2.0,16.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0
3,NR,2.0,Upright,95.0,54.0,97.0,105386.0,6.0,54.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
4,G,1.0,Upright,127.0,27.0,74.0,68918.0,5.0,24.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,89.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17707,R,0.0,Upright,104.0,9.0,74.0,1195.0,2.0,5.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0
17708,PG,2.0,Upright,108.0,291.0,92.0,101511.0,50.0,285.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,98.0
17709,NR,1.0,Upright,142.0,10.0,86.0,7146.0,0.0,8.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0
17710,PG,1.0,Upright,135.0,23.0,91.0,30193.0,6.0,22.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96.0


In [None]:
from pymongo import MongoClient

mongo_host = 'localhost'
mongo_port = 27017  # Default MongoDB port
mongo_database = 'BdaProject'
mongo_collection = 'movieRatings'

client = MongoClient(mongo_host, mongo_port)
db = client[mongo_database]
collection = db[mongo_collection]

# Convert DataFrame to a list of dictionaries
data_to_insert = data.to_dict(orient='records')

# Insert data into MongoDB
collection.insert_many(data_to_insert)

# Close the MongoDB connection
client.close()
