# ***import libraries***

In [None]:
import pandas as pd
import numpy as np
import requests

# ***Download and Read the CSV File***

In [None]:
movie_titles = pd.read_csv("Movies title.csv")
movie_titles.head(10)

Unnamed: 0,Title
0,Guardians of the Galaxy
1,Prometheus
2,Split
3,Sing
4,Suicide Squad
5,The Great Wall
6,La La Land
7,Mindhorn
8,The Lost City of Z
9,Passengers


# ***Convert dataframe to a list***

In [None]:
movie_titles = movie_titles['Title'].tolist()
print(type(movie_titles))

<class 'list'>


# ***Fetch Data from OMDB API using a Session***

In [None]:
url = "http://www.omdbapi.com/"
api_key = "5f8914b5"  # My OMDB API key  (Karima Sobhi)

movie_data = []

with requests.Session() as session:
    for title in movie_titles:
        parameters = {
            "apikey": api_key,
            "t": title
        }

        response = session.get(url, params=parameters)
        if response.status_code == 200:
            movie_data.append(response.json())
        else:
            print(f"Failed to fetch data for {title}")

        if len(movie_data) == 100:
            break  # Stop after collecting data for 100 movies

# ***Process and use the movie data***

In [None]:
movie_data

[{'Title': 'Guardians of the Galaxy',
  'Year': '2014',
  'Rated': 'PG-13',
  'Released': '01 Aug 2014',
  'Runtime': '121 min',
  'Genre': 'Action, Adventure, Comedy',
  'Director': 'James Gunn',
  'Writer': 'James Gunn, Nicole Perlman, Dan Abnett',
  'Actors': 'Chris Pratt, Vin Diesel, Bradley Cooper',
  'Plot': 'A group of intergalactic criminals must pull together to stop a fanatical warrior with plans to purge the universe.',
  'Language': 'English',
  'Country': 'United States',
  'Awards': 'Nominated for 2 Oscars. 52 wins & 103 nominations total',
  'Poster': 'https://m.media-amazon.com/images/M/MV5BNDIzMTk4NDYtMjg5OS00ZGI0LWJhZDYtMzdmZGY1YWU5ZGNkXkEyXkFqcGdeQXVyMTI5NzUyMTIz._V1_SX300.jpg',
  'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.0/10'},
   {'Source': 'Rotten Tomatoes', 'Value': '92%'},
   {'Source': 'Metacritic', 'Value': '76/100'}],
  'Metascore': '76',
  'imdbRating': '8.0',
  'imdbVotes': '1,233,785',
  'imdbID': 'tt2015381',
  'Type': 'movie',
  'DVD

In [None]:
type(movie_data)

list

# ***Data transformation and loading***

***Transform from json to structured data format***

In [None]:
structuredData = pd.DataFrame(movie_data)

In [None]:
structuredData.head()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,Error,totalSeasons
0,Guardians of the Galaxy,2014,PG-13,01 Aug 2014,121 min,"Action, Adventure, Comedy",James Gunn,"James Gunn, Nicole Perlman, Dan Abnett","Chris Pratt, Vin Diesel, Bradley Cooper",A group of intergalactic criminals must pull t...,...,1233785,tt2015381,movie,15 Nov 2015,"$333,718,600",,,True,,
1,Prometheus,2012,R,08 Jun 2012,124 min,"Adventure, Mystery, Sci-Fi",Ridley Scott,"Jon Spaihts, Damon Lindelof, Dan O'Bannon","Noomi Rapace, Logan Marshall-Green, Michael Fa...","Following clues to the origin of mankind, a te...",...,626784,tt1446714,movie,01 Mar 2013,"$126,477,084",,,True,,
2,Split,2016,PG-13,20 Jan 2017,117 min,"Horror, Thriller",M. Night Shyamalan,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",Three girls are kidnapped by a man with a diag...,...,526718,tt4972582,movie,05 Apr 2017,"$138,291,365",,,True,,
3,Sing,2016,PG,21 Dec 2016,108 min,"Animation, Comedy, Family","Garth Jennings, Christophe Lourdelet",Garth Jennings,"Matthew McConaughey, Reese Witherspoon, Seth M...","In a city of humanoid animals, a hustling thea...",...,181456,tt3470600,movie,08 Mar 2017,"$270,578,425",,,True,,
4,Suicide Squad,2016,PG-13,05 Aug 2016,123 min,"Action, Adventure, Fantasy",David Ayer,"David Ayer, John Ostrander","Will Smith, Jared Leto, Margot Robbie",A secret government agency recruits some of th...,...,706414,tt1386697,movie,15 Nov 2016,"$325,100,054",,,True,,


# ***Basic Transformation***

**Data Cleaning and Preprocessing**

In [None]:
# Find unnecessary columns
columns_to_delete = ['Error', 'totalSeasons','Awards','Type','DVD','Poster','Response','Ratings','Production','Website', 'Year']

# Drop unnecessary columns
strData = structuredData.drop(columns=columns_to_delete)

#Drop rows with null values later after type conversion

# ***Drop duplicates***

In [None]:
#Drop duplicates
strData = strData.drop_duplicates()

In [None]:
strData.head()

Unnamed: 0,Title,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Metascore,imdbRating,imdbVotes,imdbID,BoxOffice
0,Guardians of the Galaxy,PG-13,01 Aug 2014,121 min,"Action, Adventure, Comedy",James Gunn,"James Gunn, Nicole Perlman, Dan Abnett","Chris Pratt, Vin Diesel, Bradley Cooper",A group of intergalactic criminals must pull t...,English,United States,76,8.0,1233785,tt2015381,"$333,718,600"
1,Prometheus,R,08 Jun 2012,124 min,"Adventure, Mystery, Sci-Fi",Ridley Scott,"Jon Spaihts, Damon Lindelof, Dan O'Bannon","Noomi Rapace, Logan Marshall-Green, Michael Fa...","Following clues to the origin of mankind, a te...","English, Gaelic","United Kingdom, United States",64,7.0,626784,tt1446714,"$126,477,084"
2,Split,PG-13,20 Jan 2017,117 min,"Horror, Thriller",M. Night Shyamalan,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",Three girls are kidnapped by a man with a diag...,English,"United States, Japan",63,7.3,526718,tt4972582,"$138,291,365"
3,Sing,PG,21 Dec 2016,108 min,"Animation, Comedy, Family","Garth Jennings, Christophe Lourdelet",Garth Jennings,"Matthew McConaughey, Reese Witherspoon, Seth M...","In a city of humanoid animals, a hustling thea...","English, Japanese","Japan, United States, France",59,7.1,181456,tt3470600,"$270,578,425"
4,Suicide Squad,PG-13,05 Aug 2016,123 min,"Action, Adventure, Fantasy",David Ayer,"David Ayer, John Ostrander","Will Smith, Jared Leto, Margot Robbie",A secret government agency recruits some of th...,"English, Japanese, Spanish",United States,40,5.9,706414,tt1386697,"$325,100,054"


In [None]:
strData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96 entries, 0 to 98
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Title       95 non-null     object
 1   Rated       95 non-null     object
 2   Released    95 non-null     object
 3   Runtime     95 non-null     object
 4   Genre       95 non-null     object
 5   Director    95 non-null     object
 6   Writer      95 non-null     object
 7   Actors      95 non-null     object
 8   Plot        95 non-null     object
 9   Language    95 non-null     object
 10  Country     95 non-null     object
 11  Metascore   95 non-null     object
 12  imdbRating  95 non-null     object
 13  imdbVotes   95 non-null     object
 14  imdbID      95 non-null     object
 15  BoxOffice   94 non-null     object
dtypes: object(16)
memory usage: 12.8+ KB


# ***Correct Datatypes of Columns***

In [None]:
# Convert 'Runtime' to numeric
strData['Runtime'] = strData['Runtime'].apply(lambda x: pd.to_numeric(str(x).replace('min', ''), errors='coerce'))
# Convert 'Metascore' to numeric
strData['Metascore'] = pd.to_numeric(strData['Metascore'], errors='coerce')

# Convert 'imdbRating' to float64
strData['imdbRating'] = strData['imdbRating'].astype(float)

# Convert 'imdbVotes' to numeric
strData['imdbVotes'] = strData['imdbVotes'].apply(lambda x: pd.to_numeric(str(x).replace(',', ''), errors='coerce'))

# Convert 'BoxOffice' to numeric
strData["BoxOffice"] = pd.to_numeric(strData["BoxOffice"].str.replace("$", "").str.replace(",", ""), errors='coerce')



  strData["BoxOffice"] = pd.to_numeric(strData["BoxOffice"].str.replace("$", "").str.replace(",", ""), errors='coerce')


In [None]:
strData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96 entries, 0 to 98
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       95 non-null     object 
 1   Rated       95 non-null     object 
 2   Released    95 non-null     object 
 3   Runtime     94 non-null     float64
 4   Genre       95 non-null     object 
 5   Director    95 non-null     object 
 6   Writer      95 non-null     object 
 7   Actors      95 non-null     object 
 8   Plot        95 non-null     object 
 9   Language    95 non-null     object 
 10  Country     95 non-null     object 
 11  Metascore   91 non-null     float64
 12  imdbRating  95 non-null     float64
 13  imdbVotes   95 non-null     float64
 14  imdbID      95 non-null     object 
 15  BoxOffice   90 non-null     float64
dtypes: float64(5), object(11)
memory usage: 12.8+ KB


In [None]:
# Drop rows with null values
strData = strData.dropna()

# Reset index after dropping nulls
strData.reset_index(drop=True, inplace=True)

#check nulls again
strData.isna().sum()

Title         0
Rated         0
Released      0
Runtime       0
Genre         0
Director      0
Writer        0
Actors        0
Plot          0
Language      0
Country       0
Metascore     0
imdbRating    0
imdbVotes     0
imdbID        0
BoxOffice     0
dtype: int64

In [None]:
# convert to date type
strData['Released'] = pd.to_datetime(strData['Released'])
strData.head()

Unnamed: 0,Title,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Metascore,imdbRating,imdbVotes,imdbID,BoxOffice
0,Guardians of the Galaxy,PG-13,2014-08-01,121.0,"Action, Adventure, Comedy",James Gunn,"James Gunn, Nicole Perlman, Dan Abnett","Chris Pratt, Vin Diesel, Bradley Cooper",A group of intergalactic criminals must pull t...,English,United States,76.0,8.0,1233785.0,tt2015381,333718600.0
1,Prometheus,R,2012-06-08,124.0,"Adventure, Mystery, Sci-Fi",Ridley Scott,"Jon Spaihts, Damon Lindelof, Dan O'Bannon","Noomi Rapace, Logan Marshall-Green, Michael Fa...","Following clues to the origin of mankind, a te...","English, Gaelic","United Kingdom, United States",64.0,7.0,626784.0,tt1446714,126477084.0
2,Split,PG-13,2017-01-20,117.0,"Horror, Thriller",M. Night Shyamalan,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",Three girls are kidnapped by a man with a diag...,English,"United States, Japan",63.0,7.3,526718.0,tt4972582,138291365.0
3,Sing,PG,2016-12-21,108.0,"Animation, Comedy, Family","Garth Jennings, Christophe Lourdelet",Garth Jennings,"Matthew McConaughey, Reese Witherspoon, Seth M...","In a city of humanoid animals, a hustling thea...","English, Japanese","Japan, United States, France",59.0,7.1,181456.0,tt3470600,270578425.0
4,Suicide Squad,PG-13,2016-08-05,123.0,"Action, Adventure, Fantasy",David Ayer,"David Ayer, John Ostrander","Will Smith, Jared Leto, Margot Robbie",A secret government agency recruits some of th...,"English, Japanese, Spanish",United States,40.0,5.9,706414.0,tt1386697,325100054.0


In [None]:
strData.to_csv("StructuredDataset.csv")

# **Advanced Transformation**

# **Date table**

In [None]:
Date = pd.DataFrame({"DateID":np.arange(1, strData.shape[0] + 1)})

Date["DAY"] = strData["Released"].dt.day
Date["MONTH"] = strData["Released"].dt.month
Date["YEAR"] = strData["Released"].dt.year

Date.insert(0, "imdbID", strData["imdbID"])
print(Date)

Date.to_csv("Date.csv", index=False)

       imdbID  DateID  DAY  MONTH  YEAR
0   tt2015381       1    1      8  2014
1   tt1446714       2    8      6  2012
2   tt4972582       3   20      1  2017
3   tt3470600       4   21     12  2016
4   tt1386697       5    5      8  2016
..        ...     ...  ...    ...   ...
83  tt2005151      84   19      8  2016
84  tt2395427      85    1      5  2015
85  tt3799694      86   20      5  2016
86  tt4255304      87    7      4  2017
87  tt4714782      88   10      3  2017

[88 rows x 5 columns]


# **Writer Bridge**

In [None]:
# Create a bridge DataFrame for IMDb IDs and writer IDs
WriterBridge = []
writers_set = set()  # To collect unique writer names
count = 1
for index, row in strData.iterrows():
    imdb_id = row["imdbID"]
    writers = [w.strip() for w in row["Writer"].split(',')]
    writers_set.update(writers)  # Collect unique writer names
    for writer in writers:
        WriterBridge.append({"imdbID": imdb_id, "WriterID": count})
        count = count + 1

WriterBridge = pd.DataFrame(WriterBridge)
# Print the bridge DataFrame
print(WriterBridge)

WriterBridge.to_csv('WriterBridge.csv', index=False)

        imdbID  WriterID
0    tt2015381         1
1    tt2015381         2
2    tt2015381         3
3    tt1446714         4
4    tt1446714         5
..         ...       ...
187  tt3799694       188
188  tt3799694       189
189  tt4255304       190
190  tt4255304       191
191  tt4714782       192

[192 rows x 2 columns]


# **Writer**

In [None]:
# Create a writers DataFrame from the set of unique writer names
writers_df = pd.DataFrame({"WriterName": list(writers_set)})

# Add WriterID to writers_df
writers_df.insert(0, "WriterID" , range(1, 1 + len(writers_df)))

# Print the writers DataFrame
print(writers_df)

writers_df.to_csv('Writer.csv', index=False)

     WriterID             WriterName
0           1         Damon Lindelof
1           2             James Gray
2           3            Luke Davies
3           4     Christopher Markus
4           5           Rick Moranis
..        ...                    ...
168       169            Joss Whedon
169       170           Glenn Berger
170       171           Anya Kochoff
171       172  Tarell Alvin McCraney
172       173      Christopher Rouse

[173 rows x 2 columns]


# **Director Bridge**

In [None]:
# Create a bridge DataFrame for IMDb IDs and Directors IDs
DirectorBridge = []
directors_set = set()  # To collect unique Directors names
count = 1
for index, row in strData.iterrows():
    imdb_id = row["imdbID"]
    directors = [w.strip() for w in row["Director"].split(',')]
    directors_set.update(directors)  # Collect unique Director names
    for director in directors:
        DirectorBridge.append({"imdbID": imdb_id, "DirectorID": count})
        count = count + 1

DirectorBridge = pd.DataFrame(DirectorBridge)
# Print the bridge DataFrame
print(DirectorBridge)

DirectorBridge.to_csv('DirectorBridge.csv', index=False)

       imdbID  DirectorID
0   tt2015381           1
1   tt1446714           2
2   tt4972582           3
3   tt3470600           4
4   tt3470600           5
..        ...         ...
94  tt2395427          95
95  tt3799694          96
96  tt4255304          97
97  tt4255304          98
98  tt4714782          99

[99 rows x 2 columns]


# **Director**

In [None]:
# Create a directors DataFrame from the set of unique directors names
directors_df = pd.DataFrame({"DirectorName": list(directors_set)})

# Add DirectorID to directors_df
directors_df.insert(0, "DirectorID", range(1, 1 + len(directors_df)))

# Print the directors DataFrame
print(directors_df)

directors_df.to_csv('Director.csv', index=False)

    DirectorID        DirectorName
0            1          James Gray
1            2        Duncan Jones
2            3         Yimou Zhang
3            4        Terry George
4            5       David Frankel
..         ...                 ...
84          85  M. Night Shyamalan
85          86         Ben Affleck
86          87       Todd Phillips
87          88         Joss Whedon
88          89       Morten Tyldum

[89 rows x 2 columns]


# **Actor Bridge**

In [None]:
# Create a bridge DataFrame for IMDb IDs and Actor IDs
ActorBridge = []
actors_set = set()  # To collect unique Actors names
count = 1
for index, row in strData.iterrows():
    imdb_id = row["imdbID"]
    actors = [w.strip() for w in row["Actors"].split(',')]
    actors_set.update(actors)  # Collect unique Actors names
    for actor in actors:
        ActorBridge.append({"imdbID": imdb_id, "ActorID": count})
        count = count + 1

ActorBridge = pd.DataFrame(ActorBridge)

# Change the data type of the "ActorID" column to TINYINT
ActorBridge['ActorID'] = ActorBridge['ActorID'].astype('int8')

# Print the bridge DataFrame
print(ActorBridge)

ActorBridge.to_csv('ActorBridge.csv', index=False)

        imdbID  ActorID
0    tt2015381        1
1    tt2015381        2
2    tt2015381        3
3    tt1446714        4
4    tt1446714        5
..         ...      ...
259  tt4255304        4
260  tt4255304        5
261  tt4714782        6
262  tt4714782        7
263  tt4714782        8

[264 rows x 2 columns]


# **Actor**

In [None]:
# Create a actors DataFrame from the set of unique actors names
actors_df = pd.DataFrame({"ActorName": list(actors_set)})

# Add ActorID to actors_df
actors_df.insert(0, "ActorID", range(1, 1 + len(actors_df)))

# Change the data type of the "ActorID" column to TINYINT
actors_df['ActorID'] = actors_df['ActorID'].astype('int8')

# Print the actors DataFrame
print(actors_df)

actors_df.to_csv('Actor.csv', index=False)

     ActorID          ActorName
0          1     Mahershala Ali
1          2       Heath Ledger
2          3  Gabriel Chavarria
3          4      Rosamund Pike
4          5     Gemma Arterton
..       ...                ...
203      -52      Casey Affleck
204      -51  Reese Witherspoon
205      -50  Michelle Monaghan
206      -49    Michael Shannon
207      -48        Kate Hudson

[208 rows x 2 columns]


# **Country Bridge**

In [None]:
# Create a bridge DataFrame for IMDb IDs and Country IDs
CountryBridge = []
countries_set = set()  # To collect unique Country names
count = 1
for index, row in strData.iterrows():
    imdb_id = row["imdbID"]
    countries = [w.strip() for w in row["Country"].split(',')]
    countries_set.update(countries)  # Collect unique Country names
    for country in countries:
        CountryBridge.append({"imdbID": imdb_id, "CountryID": count})
        count = count + 1

CountryBridge = pd.DataFrame(CountryBridge)
# Print the bridge DataFrame
print(CountryBridge)

CountryBridge.to_csv('CountryBridge.csv', index=False)

        imdbID  CountryID
0    tt2015381          1
1    tt1446714          2
2    tt1446714          3
3    tt4972582          4
4    tt4972582          5
..         ...        ...
162  tt4255304        163
163  tt4714782        164
164  tt4714782        165
165  tt4714782        166
166  tt4714782        167

[167 rows x 2 columns]


# **Country**

In [None]:
# Create a Country DataFrame from the set of unique country names
country_df = pd.DataFrame({"Country": list(countries_set)})

# Add countryID to country_df
country_df.insert(0, "CountryID", range(1, 1 + len(country_df)))

# Print the countries DataFrame
print(country_df)

country_df.to_csv('Country.csv', index=False)

    CountryID         Country
0           1          Taiwan
1           2     South Korea
2           3          Mexico
3           4         Hungary
4           5       Australia
5           6          Greece
6           7  Czech Republic
7           8         Romania
8           9   United States
9          10           Malta
10         11           India
11         12  United Kingdom
12         13          France
13         14          Sweden
14         15         Germany
15         16           Japan
16         17           China
17         18          Canada
18         19           Spain
19         20         Belgium
20         21       Hong Kong


# **Language Bridge**

In [None]:
# Create a bridge DataFrame for IMDb IDs and Language  IDs
LanguageBridge = []
Language_set = set()  # To collect unique Language  names
count = 1
for index, row in strData.iterrows():
    imdb_id = row["imdbID"]
    Languages = [w.strip() for w in row["Language"].split(',')]
    Language_set.update(Languages)  # Collect unique Language  names
    for Language in Languages:
        LanguageBridge.append({"imdbID": imdb_id, "LanguageID": count})
        count = count + 1

LanguageBridge = pd.DataFrame(LanguageBridge)
# Print the bridge DataFrame
print(LanguageBridge)

LanguageBridge.to_csv('LanguageBridge.csv', index=False)

        imdbID  LanguageID
0    tt2015381           1
1    tt1446714           2
2    tt1446714           3
3    tt4972582           4
4    tt3470600           5
..         ...         ...
178  tt4255304         179
179  tt4714782         180
180  tt4714782         181
181  tt4714782         182
182  tt4714782         183

[183 rows x 2 columns]


# **Language**

In [None]:
# Create a Language DataFrame from the set of unique Language names
Language_df = pd.DataFrame({"Language": list(Language_set)})

# Add LanguageID to Language_df
Language_df.insert(0, "LanguageID", range(1, 1 + len(Language_df)))

# Print the Language DataFrame
print(Language_df)

Language_df.to_csv('Language.csv', index=False)

    LanguageID               Language
0            1                 Gaelic
1            2             Portuguese
2            3                 German
3            4     Egyptian (Ancient)
4            5               Armenian
5            6                Persian
6            7                Russian
7            8                English
8            9              Hungarian
9           10                Chinese
10          11                 Arabic
11          12               Romanian
12          13                Spanish
13          14               Japanese
14          15                Bengali
15          16                Yiddish
16          17                  Hindi
17          18                  Latin
18          19              Cantonese
19          20                Turkish
20          21                   Tupi
21          22                  Greek
22          23                 Polish
23          24  North American Indian
24          25              Esperanto
25          

# **Genre Bridge**

In [None]:
# Create a bridge DataFrame for IMDb IDs and Genre IDs
GenreBridge = []
Genre_set = set()  # To collect unique Genre  names
count = 1
for index, row in strData.iterrows():
    imdb_id = row["imdbID"]
    Genres = [w.strip() for w in row["Genre"].split(',')]
    Genre_set.update(Genres)  # Collect unique Genre  names
    for Genre in Genres:
        GenreBridge.append({"imdbID": imdb_id, "GenreID": count})
        count = count + 1

GenreBridge = pd.DataFrame(GenreBridge)
# Print the bridge DataFrame
print(GenreBridge)

GenreBridge.to_csv('GenreBridge.csv', index=False)

        imdbID  GenreID
0    tt2015381        1
1    tt2015381        2
2    tt2015381        3
3    tt1446714        4
4    tt1446714        5
..         ...      ...
230  tt4255304      231
231  tt4255304      232
232  tt4255304      233
233  tt4714782      234
234  tt4714782      235

[235 rows x 2 columns]


# **Genre**

In [None]:
# Create a Genre DataFrame from the set of unique Genre names
Genre_df = pd.DataFrame({"Genre": list(Genre_set)})

# Add GenreID to Genre_df
Genre_df.insert(0, "GenreID", range(1, 1 + len(Genre_df)))

# Print the Genre DataFrame
print(Genre_df)

Genre_df.to_csv('Genre.csv', index=False)

    GenreID      Genre
0         1  Biography
1         2    Romance
2         3      Crime
3         4   Thriller
4         5    Mystery
5         6        War
6         7    Fantasy
7         8    History
8         9     Action
9        10      Drama
10       11      Music
11       12     Horror
12       13  Adventure
13       14     Sci-Fi
14       15     Comedy
15       16  Animation
16       17     Family


# **Movie Description**

In [None]:
# Create the Movie Description DataFrame with IMDb IDs, titles, plots and Rated
MovieDescription = strData[["imdbID", "Title", "Plot", "Rated"]]

MovieDescription.to_csv("MovieDescription.csv")
MovieDescription.head(10)

Unnamed: 0,imdbID,Title,Plot,Rated
0,tt2015381,Guardians of the Galaxy,A group of intergalactic criminals must pull t...,PG-13
1,tt1446714,Prometheus,"Following clues to the origin of mankind, a te...",R
2,tt4972582,Split,Three girls are kidnapped by a man with a diag...,PG-13
3,tt3470600,Sing,"In a city of humanoid animals, a hustling thea...",PG
4,tt1386697,Suicide Squad,A secret government agency recruits some of th...,PG-13
5,tt2034800,The Great Wall,"In ancient China, a group of European mercenar...",PG-13
6,tt3783958,La La Land,"While navigating their careers in Los Angeles,...",PG-13
7,tt1212428,The Lost City of Z,"A true-life drama, centering on British explor...",PG-13
8,tt1355644,Passengers,A malfunction in a sleeping pod on a spacecraf...,PG-13
9,tt3183660,Fantastic Beasts and Where to Find Them,The adventures of writer Newt Scamander in New...,PG-13


# **Movie (our Fact Table)**

In [None]:
# Merge DataFrames with WriterBridge, DirectorBridge, ActorBridge, MovieDescription, Rating, and Date
merged_Movie_data = strData[["imdbID", "Runtime", "Metascore", "BoxOffice", "imdbRating", "imdbVotes"]]

merged_Movie_data = pd.merge(merged_Movie_data, Date, on="imdbID", how="left")

# # Create the "Movie" DataFrame with IMDb IDs, Metascore and other relevant columns
Movie = merged_Movie_data[["imdbID","Runtime", "Metascore", "BoxOffice", "imdbRating", "imdbVotes", "DateID"]]
Movie.to_csv("Movie.csv")
Movie.head()

Unnamed: 0,imdbID,Runtime,Metascore,BoxOffice,imdbRating,imdbVotes,DateID
0,tt2015381,121.0,76.0,333718600.0,8.0,1233785.0,1
1,tt1446714,124.0,64.0,126477084.0,7.0,626784.0,2
2,tt4972582,117.0,63.0,138291365.0,7.3,526718.0,3
3,tt3470600,108.0,59.0,270578425.0,7.1,181456.0,4
4,tt1386697,123.0,40.0,325100054.0,5.9,706414.0,5
