In [108]:
# !pip install pillow

In [109]:
%%time
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from concurrent.futures import ThreadPoolExecutor
from sklearn.decomposition import PCA
from PIL import Image
import os
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

CPU times: total: 0 ns
Wall time: 0 ns


In [110]:
# 讀取csv檔案
df = pd.read_csv("./data/movie_list_20231126.csv")

In [111]:
df.dtypes

id                           int64
imdb_id                     object
title                       object
vote_average               float64
vote_count                   int64
status                      object
release_date                object
revenue                      int64
runtime                      int64
isAdult                       bool
budget                       int64
original_language           object
original_title              object
overview                    object
popularity                 float64
tagline                     object
genres                      object
Action                       int64
Adult                        int64
Adventure                    int64
Animation                    int64
Biography                    int64
Comedy                       int64
Crime                        int64
Documentary                  int64
Drama                        int64
Family                       int64
Fantasy                      int64
GameShow            

In [112]:
x = list(df.head(0))
y = [
    "Action",
    "Adult",
    "Adventure",
    "Animation",
    "Biography",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Family",
    "Fantasy",
    "GameShow",
    "History",
    "Horror",
    "Music",
    "Musical",
    "Mystery",
    "News",
    "RealityTV",
    "Romance",
    "SciFi",
    "Short",
    "Sport",
    "TalkShow",
    "Thriller",
    "TV_Movie",
    "War",
    "Western",
]

z = [
    "oscarNominations",
    "oscarNominationCategory",
    "oscarWinner",
    "oscarWinnerCategory",
    "cast1",
    "cast2",
    "cast3",
    "cast4",
    "isActingWinner",
]

for j in y:
    x.remove(j)

for j in z:
    x.remove(j)
x

['id',
 'imdb_id',
 'title',
 'vote_average',
 'vote_count',
 'status',
 'release_date',
 'revenue',
 'runtime',
 'isAdult',
 'budget',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'tagline',
 'genres',
 'production_companies',
 'production_countries',
 'spoken_languages',
 'director',
 'writer',
 'cast',
 'rating',
 'numVotes']

In [113]:
for j in x:
    count = 0
    for i in df[j]:
        if i == 0:
            count += 1
    if count > 0:
        print(j, count)

# Name: isAdult, dtype: bool

vote_average 127977
vote_count 127885
revenue 263823
runtime 23881
isAdult 253860
budget 250661
popularity 1553
rating 67348
numVotes 67348


In [114]:
# 刪除 revenue 為 0 的資料
df = df[df["revenue"] != 0]
df.shape

(10885, 62)

In [115]:
# 刪除 budget 為 0 的資料
df = df[df["budget"] != 0]
df.shape

(7939, 62)

In [116]:
# 刪除 runtime 為 0 的資料
df = df[df["runtime"] != 0]
df.shape

(7907, 62)

In [117]:
# 尚未Released, 卻有票房?!
df[df["status"] != "Released"]

Unnamed: 0,id,imdb_id,title,vote_average,vote_count,status,release_date,revenue,runtime,isAdult,budget,original_language,original_title,overview,popularity,tagline,genres,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,GameShow,History,Horror,Music,Musical,Mystery,News,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,TV_Movie,War,Western,production_companies,production_countries,spoken_languages,director,writer,cast,rating,numVotes,oscarNominations,oscarNominationCategory,oscarWinner,oscarWinnerCategory,cast1,cast2,cast3,cast4,isActingWinner
205409,1116492,tt27351080,Dial,0.0,0,Post Production,2023-10-31,5000,15,False,5000,en,Dial,Dani cares for her ill mother who suddenly die...,1.088,Mother is calling,"Drama, Horror",0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Trett Films,United Kingdom,English,Josh Trett,Josh Trett,"Olivia Bourne, Denise Stephenson, Greg Lindsay...",0.0,0,0,,0,,Olivia Bourne,Denise Stephenson,Greg Lindsay-Smith,Sonia Soomessur,False
208971,1172595,tt28689884,Mother's Ruin: Unmasking the WMSCOG,0.0,0,In Production,2023-10-07,300,24,False,300,en,Mother's Ruin: Unmasking the WMSCOG,"In this investigative exposé, award-winning jo...",5.11,They don't want you watching this.,Documentary,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,English,Harry Robinson,Harry Robinson,,0.0,0,0,,0,,,,,,False


In [118]:
df = df[df["status"] == "Released"]

In [119]:
for j in x:
    count = 0
    for i in df[j]:
        if i == 0:
            count += 1
    if count > 0:
        print(j, count)

vote_average 231
vote_count 230
isAdult 7891
popularity 3
rating 258
numVotes 258


In [120]:
# 計算每列的缺失值數量
missing_values = df.isnull().sum()

# 過濾出缺失值不等於零的列
non_zero_missing_values = missing_values[missing_values != 0]

# 顯示統計信息
print(non_zero_missing_values)

overview                      5
tagline                     769
production_companies        263
production_countries        233
spoken_languages            189
director                     55
writer                      152
cast                        173
oscarNominationCategory    6405
oscarWinnerCategory        7324
cast1                       173
cast2                       241
cast3                       299
cast4                       364
dtype: int64


In [121]:
df = df.dropna(subset=["cast"])
df = df.dropna(subset=["director"])
df = df.dropna(subset=["writer"])
df = df.dropna(subset=["production_companies"])
df = df.dropna(subset=["production_countries"])
df = df.dropna(subset=["spoken_languages"])
df.shape

(7309, 62)

In [122]:
missing_values = df.isnull().sum()
non_zero_missing_values = missing_values[missing_values != 0]
print(non_zero_missing_values)

overview                      2
tagline                     548
oscarNominationCategory    5827
oscarWinnerCategory        6735
cast2                        34
cast3                        60
cast4                        93
dtype: int64


In [123]:
# 使用 drop 方法刪除欄
df = df.drop("overview", axis=1)
df = df.drop("cast1", axis=1)
df = df.drop("cast2", axis=1)
df = df.drop("cast3", axis=1)
df = df.drop("cast4", axis=1)
df = df.drop("oscarNominationCategory", axis=1)
df = df.drop("oscarWinnerCategory", axis=1)
df = df.drop("tagline", axis=1)
df.shape

(7309, 54)

In [124]:
df["spoken_languages"].unique()

array(['English, No Language', 'No Language', 'English',
       'English, Spanish, Yiddish', 'English, French, German, Latin',
       'French, English, German', 'Mandarin, English, French',
       'English, French', 'English, Hungarian, Latin', 'English, German',
       'English, Spanish', 'English, Latin, Hungarian, French',
       'English, Spanish, French', 'French, English',
       'French, Italian, English', 'English, Italian',
       'English, Tahitian', 'English, Russian',
       'French, English, Italian', 'Spanish, English', 'English, Hindi',
       'English, Latin', 'English, Esperanto', 'English, Welsh',
       'German, French, English', 'German, French, Italian, English',
       'English, Serbian', 'German, English', 'Japanese, English',
       'English, Italian, German', 'English, French, German',
       'English, French, German, Spanish', 'Cantonese, English',
       'English, Italian, Portuguese, Spanish', 'English, Swahili',
       'Latin, Spanish, English', 'English, S

In [125]:
lan = []

for l in df["spoken_languages"]:
    for ll in l.split(", "):
        lan.append(ll)
lan

['English',
 'No Language',
 'English',
 'No Language',
 'No Language',
 'No Language',
 'English',
 'No Language',
 'No Language',
 'English',
 'No Language',
 'No Language',
 'No Language',
 'English',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'English',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'No Language',
 'English',
 'Spanish',
 'Yiddish',
 'English',
 'No Language',
 'English',
 'No Language',
 'No Language',
 'No Language',
 'English',
 'No Language',
 'English',
 'No Language',
 'No Language',
 'No Language',
 'English',
 'English',
 'English',
 'English',
 'English',
 'No Language',
 'English',
 'English',
 'English',
 'English',
 'French',
 'German',
 'Latin',
 'French',
 'English',
 'German',
 'English',
 'English',
 'Engl

In [126]:
from collections import Counter

# 使用 Counter 計算每個語言的次數
language_counts = Counter(lan)

# 將結果轉換為字典，其中鍵是語言，值是出現次數
language_counts_dict = dict(language_counts)

# 將字典按照值（出現次數）降序排序
sorted_language_counts = dict(
    sorted(language_counts_dict.items(), key=lambda item: item[1], reverse=True)
)

# 輸出結果
print(sorted_language_counts)

{'English': 7227, 'Spanish': 593, 'French': 550, 'German': 338, 'Italian': 312, 'Russian': 244, 'Japanese': 134, 'Mandarin': 119, 'Arabic': 106, 'Latin': 92, 'Portuguese': 77, 'Cantonese': 57, 'No Language': 44, 'Hebrew': 36, 'Polish': 35, 'Greek': 33, 'Hungarian': 32, 'Hindi': 32, 'Korean': 31, 'Swedish': 30, 'Turkish': 27, 'Vietnamese': 26, 'Thai': 25, 'Czech': 25, 'Yiddish': 18, 'Dutch': 16, 'Romanian': 16, 'Urdu': 16, 'Ukrainian': 15, 'Swahili': 13, 'Irish': 13, 'Norwegian': 13, 'Persian': 12, 'Icelandic': 11, 'Afrikaans': 11, 'Indonesian': 11, 'Serbian': 10, 'Tagalog': 10, 'Danish': 9, 'Gaelic': 8, 'Croatian': 8, 'Georgian': 8, 'Bulgarian': 8, 'Finnish': 7, 'Tibetan': 7, 'Armenian': 7, 'Malay': 6, 'Xhosa': 6, 'Punjabi': 5, 'Esperanto': 4, 'Navajo': 4, 'Serbo-Croatian': 4, 'Slovak': 4, 'Albanian': 4, 'Somali': 4, 'Bengali': 4, 'Estonian': 4, 'Welsh': 3, 'Zulu': 3, 'Khmer': 3, 'Akan': 3, 'Haitian; Haitian Creole': 3, 'Bosnian': 3, 'Sanskrit': 3, 'Basque': 2, 'Samoan': 2, 'Nepali': 2

In [127]:
df["spoken_languages_en"] = 0
df["spoken_languages_other"] = 0

for index, row in df.iterrows():
    if "English" in str(row["spoken_languages"]):
        df.at[index, "spoken_languages_en"] += 1
        df.at[index, "spoken_languages_other"] += row["spoken_languages"].count(",")
    else:
        df.at[index, "spoken_languages_other"] += row["spoken_languages"].count(",")
        df.at[index, "spoken_languages_other"] += 1

In [128]:
df["spoken_languages_other"]

912        1
1327       1
1489       1
1555       1
1676       1
1962       1
2022       0
2245       1
2330       1
3162       1
3285       0
3734       1
4174       1
4475       1
4548       1
4881       1
5494       1
5634       1
5760       1
5771       1
5960       1
6086       1
6121       1
6216       1
6227       1
6231       0
6541       1
6624       1
6629       1
6631       1
6764       1
7011       1
7052       1
7330       1
7385       2
7416       1
7461       0
7547       1
7677       1
7823       1
7950       0
7998       1
8020       0
8023       1
8083       1
8213       1
8297       0
8330       0
8355       0
8418       0
8569       0
8582       1
8625       0
8641       0
8751       0
8783       3
8998       2
9070       0
9172       0
9231       0
9285       2
9444       0
9512       0
9524       1
9526       1
9569       2
9604       0
9614       0
9859       0
9998       0
10082      0
10216      0
10225      0
10292      0
10419      0
10455      0
10536      0

In [129]:
df["production_companies"].unique

<bound method Series.unique of 912                             Independent Moving Pictures
1327                    Jesse L. Lasky Feature Play Company
1489                                         Epoch Film Co.
1555         Jesse L. Lasky Feature Play Company, Paramount
1676                    Jesse L. Lasky Feature Play Company
1962                    Jesse L. Lasky Feature Play Company
2022                            The Universal Film Mfg. Co.
2245        Triangle Film Corporation, Wark Producing Corp.
2330                    Jesse L. Lasky Feature Play Company
3162                           Herbert M. Dawley Production
3285                     Mabel Normand Feature Film Company
3734                 Paramount, Mayflower Photoplay Company
4174                                   Fox Film Corporation
4475                       Famous Players-Lasky Corporation
4548                            Charles Chaplin Productions
4881                   Universal Film Manufacturing Company
5494     

In [130]:
com = []

for l in df["production_companies"]:
    for ll in l.split(", "):
        com.append(ll)
com

['Independent Moving Pictures',
 'Jesse L. Lasky Feature Play Company',
 'Epoch Film Co.',
 'Jesse L. Lasky Feature Play Company',
 'Paramount',
 'Jesse L. Lasky Feature Play Company',
 'Jesse L. Lasky Feature Play Company',
 'The Universal Film Mfg. Co.',
 'Triangle Film Corporation',
 'Wark Producing Corp.',
 'Jesse L. Lasky Feature Play Company',
 'Herbert M. Dawley Production',
 'Mabel Normand Feature Film Company',
 'Paramount',
 'Mayflower Photoplay Company',
 'Fox Film Corporation',
 'Famous Players-Lasky Corporation',
 'Charles Chaplin Productions',
 'Universal Film Manufacturing Company',
 'Hal Roach Studios',
 'Warner Bros. Pictures',
 'The Harold Lloyd Corporation',
 'Metro-Goldwyn-Mayer',
 'Douglas Fairbanks Pictures',
 'Metro-Goldwyn-Mayer',
 'Warner Bros. Pictures',
 'Charles Chaplin Productions',
 'Metro-Goldwyn Pictures Corporation',
 'Metro-Goldwyn-Mayer',
 'Metro-Goldwyn-Mayer',
 'Paramount',
 'Metro-Goldwyn-Mayer',
 'Warner Bros. Pictures',
 'Metro-Goldwyn-Mayer',
 '

In [131]:
company_counts = Counter(com)

company_counts_dict = dict(company_counts)


sorted_company_counts = dict(
    sorted(company_counts_dict.items(), key=lambda item: item[1], reverse=True)
)

print(sorted_company_counts)

{'Warner Bros. Pictures': 626, 'Universal Pictures': 578, 'Paramount': 473, '20th Century Fox': 473, 'Columbia Pictures': 445, 'Metro-Goldwyn-Mayer': 369, 'New Line Cinema': 229, 'Walt Disney Pictures': 184, 'Touchstone Pictures': 165, 'Lionsgate': 154, 'Miramax': 147, 'United Artists': 144, 'DreamWorks Pictures': 123, 'Relativity Media': 123, 'Summit Entertainment': 122, 'TriStar Pictures': 117, 'Village Roadshow Pictures': 98, 'StudioCanal': 96, 'Focus Features': 94, 'Regency Enterprises': 92, 'Canal+': 89, 'Film4 Productions': 84, 'Working Title Films': 80, 'Screen Gems': 78, 'Dimension Films': 77, 'Fox Searchlight Pictures': 77, 'Amblin Entertainment': 75, 'Orion Pictures': 72, 'Dune Entertainment': 69, 'Blumhouse Productions': 66, 'The Weinstein Company': 65, 'Imagine Entertainment': 64, 'Millennium Media': 63, 'TSG Entertainment': 63, 'Fox 2000 Pictures': 61, 'BBC Film': 58, 'Mandate International': 57, 'Hollywood Pictures': 56, 'Scott Rudin Productions': 56, 'New Regency Picture

In [132]:
df.shape

(7309, 56)

In [133]:
sorted_company_counts

{'Warner Bros. Pictures': 626,
 'Universal Pictures': 578,
 'Paramount': 473,
 '20th Century Fox': 473,
 'Columbia Pictures': 445,
 'Metro-Goldwyn-Mayer': 369,
 'New Line Cinema': 229,
 'Walt Disney Pictures': 184,
 'Touchstone Pictures': 165,
 'Lionsgate': 154,
 'Miramax': 147,
 'United Artists': 144,
 'DreamWorks Pictures': 123,
 'Relativity Media': 123,
 'Summit Entertainment': 122,
 'TriStar Pictures': 117,
 'Village Roadshow Pictures': 98,
 'StudioCanal': 96,
 'Focus Features': 94,
 'Regency Enterprises': 92,
 'Canal+': 89,
 'Film4 Productions': 84,
 'Working Title Films': 80,
 'Screen Gems': 78,
 'Dimension Films': 77,
 'Fox Searchlight Pictures': 77,
 'Amblin Entertainment': 75,
 'Orion Pictures': 72,
 'Dune Entertainment': 69,
 'Blumhouse Productions': 66,
 'The Weinstein Company': 65,
 'Imagine Entertainment': 64,
 'Millennium Media': 63,
 'TSG Entertainment': 63,
 'Fox 2000 Pictures': 61,
 'BBC Film': 58,
 'Mandate International': 57,
 'Hollywood Pictures': 56,
 'Scott Rudin 

In [134]:
# Universal Pictures
Universal = """
Focus Features
Focus World
High Top Releasing
Working Title Films
Big Idea Entertainment
Bullwinkle Studios
DreamWorks Animation
DreamWorks Classics
Illumination
Illumination Studios Paris
Universal Animation Studios
Amblin Partners
Carnival Films
Makeready
NBCUniversal Japan
OTL Releasing
United International Pictures
Universal 1440 Entertainment
WT2 Productions
Peacock
Hayu
Vudu
SkyShowtime
"""

In [135]:
Universal = Universal.split("\n")

In [136]:
Universal.pop(0)
Universal.pop(-1)

''

In [137]:
for i in Universal:
    df["production_companies"] = df["production_companies"].apply(
        lambda x: x.replace(i, "Universal Pictures")
    )

In [138]:
# Paramount
Paramount = """
Miramax
BET Films
MTV Entertainment Studios
Nickelodeon Movies
Paramount Players
Republic Pictures
CBS Eye Animation Productions
Avatar Studios
MTV Animation
Nickelodeon Animation Studio
Paramount Animation
Awesomeness Films
Miramax Family
Paramount Digital Entertainment
United International Pictures
Viacom18 Studios
Paramount Digital Studios
Jio Studios
Paramount+
Pluto TV
BET+
Noggin
Nick+
JioCinema
My5
Philo
SkyShowtime
"""

In [139]:
Paramount = Paramount.split("\n")

Paramount.pop(0)
Paramount.pop(-1)

for i in Paramount:
    df["production_companies"] = df["production_companies"].apply(
        lambda x: x.replace(i, "Paramount")
    )

In [140]:
# Warner Bros. Pictures
Warner = """
New Line Cinema
HBO Films
Spyglass Media Group
Rooster Teeth Studios
Discovery Films
TruTV Films
CNN Films
DC Studios
Cartoon Network Movies
Cartoon Network Studios
Wang Film Productions
Warner Bros. Animation
Warner Bros. Japan Anime
Warner Bros. Pictures Animation
Alloy Entertainment
DC Entertainment
Cinemax Films
Flagship Entertainment Group
Castle Rock Entertainment
Turner Entertainment Co.
Warner Bros. Japan
Max
Discovery+
Vudu
Philo
"""

In [141]:
Warner = Warner.split("\n")

Warner.pop(0)
Warner.pop(-1)

for i in Warner:
    df["production_companies"] = df["production_companies"].apply(
        lambda x: x.replace(i, "Warner Bros. Pictures")
    )

In [142]:
# Walt Disney Pictures
Disney = """
20th Century Fox
20th Century Studios
A&E IndieFilms
Disneynature
Hulu Documentary Films
Searchlight Pictures
Disney Channel Original Movies
ESPN Films
NatGeo Doc Films
Lucasfilm
Marvel Studios
Freeform Original Productions
20th Century Animation
Lucasfilm Animation
Marvel Animation
Marvel Studios Animation
Pixar
Walt Disney Animation Studios
20th Century Family
A&E Films
Walt Disney Pictures India
Regency Enterprises
Star Studios
Marvel Entertainment
Vice Films
Walt Disney Studios Motion Pictures
Walt Disney Studios Sony Pictures Releasing
Disney+
Hulu
ESPN+
Disney+ Hotstar
Star+
Movies Anywhere
Philo
"""

In [143]:
Disney = Disney.split("\n")

Disney.pop(0)
Disney.pop(-1)

for i in Disney:
    df["production_companies"] = df["production_companies"].apply(
        lambda x: x.replace(i, "Walt Disney Pictures")
    )

In [144]:
# Columbia Pictures
Columbia = """
TriStar Pictures
Sony Pictures Classics
Affirm Films
Ghost Corps
Screen Gems
Stage 6 Films
Crunchyroll, LLC
Crunchyroll Store
Crunchyroll UK and Ireland
Crunchyroll EMEA
Crunchyroll Store Australia
Madhouse
Sony Pictures Animation
Sony Pictures Imageworks
3000 Pictures
Destination Films
Left Bank Pictures
Sony Pictures Japan
Sony Pictures Releasing
Sony Pictures Worldwide Acquisitions
TriStar Productions
Walt Disney Studios Sony Pictures Releasing
Sony Pictures India
Sony Pictures Networks Productions
Sony Pictures Core
SonyLIV
Crunchyroll
Funimation
Great American Pure Flix
"""

In [145]:
Columbia = Columbia.split("\n")

Columbia.pop(0)
Columbia.pop(-1)

for i in Columbia:
    df["production_companies"] = df["production_companies"].apply(
        lambda x: x.replace(i, "Columbia Pictures")
    )

In [None]:
# Philo (minority stake)
# Paramount Pictures
# Warner Bros. Pictures
# Walt Disney Pictures

In [146]:
com = []

for l in df["production_companies"]:
    for ll in l.split(", "):
        com.append(ll)

company_counts = Counter(com)
company_counts_dict = dict(company_counts)
sorted_company_counts = dict(
    sorted(company_counts_dict.items(), key=lambda item: item[1], reverse=True)
)

sorted_company_counts

{'Warner Bros. Pictures': 957,
 'Walt Disney Pictures': 853,
 'Universal Pictures': 829,
 'Columbia Pictures': 736,
 'Paramount': 671,
 'Metro-Goldwyn-Mayer': 369,
 'Touchstone Pictures': 165,
 'Lionsgate': 154,
 'United Artists': 144,
 'DreamWorks Pictures': 123,
 'Relativity Media': 123,
 'Summit Entertainment': 122,
 'Village Roadshow Pictures': 98,
 'StudioCanal': 96,
 'Canal+': 89,
 'Film4 Productions': 84,
 'Dimension Films': 77,
 'Fox Walt Disney Pictures': 77,
 'Amblin Entertainment': 75,
 'Orion Pictures': 72,
 'Dune Entertainment': 69,
 'Blumhouse Productions': 66,
 'The Weinstein Company': 65,
 'Imagine Entertainment': 64,
 'Millennium Media': 63,
 'TSG Entertainment': 63,
 'Fox 2000 Pictures': 61,
 'BBC Film': 58,
 'Mandate International': 57,
 'Hollywood Pictures': 56,
 'Scott Rudin Productions': 56,
 'New Regency Pictures': 55,
 'Ingenious Media': 55,
 'Sony Pictures': 53,
 'RKO Radio Pictures': 51,
 'PolyGram Filmed Entertainment': 51,
 'Silver Pictures': 50,
 'Original 

In [159]:
"""
{'Warner Bros. Pictures': 957,
 'Walt Disney Pictures': 853,
 'Universal Pictures': 829,
 'Columbia Pictures': 736,
 'Paramount': 671,
"""

df["production_companies_Warner"] = 0
df["production_companies_Disney"] = 0
df["production_companies_Universal"] = 0
df["production_companies_Columbia"] = 0
df["production_companies_Paramount"] = 0
df["production_companies_other"] = 0

for index, row in df.iterrows():
    if "Warner Bros. Pictures" in str(row["production_companies"]):
        df.at[index, "production_companies_Warner"] += 1

    if "Walt Disney Pictures" in str(row["production_companies"]):
        df.at[index, "production_companies_Disney"] += 1

    if "Universal Pictures" in str(row["production_companies"]):
        df.at[index, "production_companies_Warner"] += 1

    if "Columbia Pictures" in str(row["production_companies"]):
        df.at[index, "production_companies_Universal"] += 1

    if "Paramount" in str(row["production_companies"]):
        df.at[index, "production_companies_Paramount"] += 1

    if "" in str(row["production_companies"]):
        df.at[index, "production_companies_other"] += 1

    #     df.at[index, "spoken_languages_other"] += row["spoken_languages"].count(",")
    # else:
    #     df.at[index, "spoken_languages_other"] += row["spoken_languages"].count(",")
    #     df.at[index, "spoken_languages_other"] += 1

Warner Bros. Pictures
<built-in method split of str object at 0x000001C5D37DCA80>


TypeError: object of type 'builtin_function_or_method' has no len()

4046

In [150]:
df["production_companies"].unique

<bound method Series.unique of 912                             Independent Moving Pictures
1327                    Jesse L. Lasky Feature Play Company
1489                                         Epoch Film Co.
1555         Jesse L. Lasky Feature Play Company, Paramount
1676                    Jesse L. Lasky Feature Play Company
1962                    Jesse L. Lasky Feature Play Company
2022                            The Universal Film Mfg. Co.
2245        Triangle Film Corporation, Wark Producing Corp.
2330                    Jesse L. Lasky Feature Play Company
3162                           Herbert M. Dawley Production
3285                     Mabel Normand Feature Film Company
3734                 Paramount, Mayflower Photoplay Company
4174                                   Fox Film Corporation
4475                       Famous Players-Lasky Corporation
4548                            Charles Chaplin Productions
4881                   Universal Film Manufacturing Company
5494     

In [None]:
df = df.dropna(subset=["production_countries"])

In [None]:
aN = [
    "vote_average",
    "vote_count",
    "revenue",
    "runtime",
    "isAdult",
    "budget",
    "popularity",
    "rating",
    "numVotes",
    "oscarNominations",
    "oscarWinner",
    "isActingWinner",
]

In [None]:
# 創建相關矩陣
#  'category' 依類別上整數標籤
correlation_matrix = df[aN].corr()

# 繪製熱力圖
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix Heatmap")
plt.show()

In [None]:
aN.remove("revenue")
X = df[aN]
Y = df["revenue"]

print(X.shape)
print(Y.shape)

In [None]:
# 創建 MinMaxScaler 對象
scaler_X = MinMaxScaler()
scaler_Y = MinMaxScaler()

# 對 X 和 Y 進行 Min-Max 歸一化
X_scaled = scaler_X.fit_transform(X)
Y_scaled = scaler_Y.fit_transform(Y.values.reshape(-1, 1))

# 如果需要，你可以將歸一化後的數據轉換回原始尺度
X_original_scale = scaler_X.inverse_transform(X_scaled)
Y_original_scale = scaler_Y.inverse_transform(Y_scaled)

In [None]:
# 將 X 和 Y 合併成一個 DataFrame
normalized_data = pd.DataFrame(
    np.concatenate([X_scaled, Y_scaled], axis=1), columns=list(X.columns) + ["Y"]
)

# 計算相關係數矩陣
correlation_matrix = normalized_data.corr()

# 設定圖形大小
plt.figure(figsize=(12, 10))

# 繪製熱力圖
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)

# 顯示圖形
plt.show()

In [None]:
def evaluate_model(model, X_test, y_test):
    # 進行預測
    y_pred = model.predict(X_test)

    # 計算評估指標
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # 顯示評估結果
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R^2 Score: {r2}")

In [None]:
def train_random_forest(X, y):
    # 拆分數據集
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=87
    )

    # 設定要嘗試的超參數範圍
    param_grid = {
        "n_estimators": [100, 200, 300],  # 樹的數量
        "max_features": ["sqrt", "log2"],  # 劃分時考慮的最大特徵數
        "max_depth": [4, 6, 8, 10],  # 樹的最大深度
        "criterion": ["squared_error"],  # 劃分樹時的評估標準
    }

    # 創建隨機森林模型
    rf_model = RandomForestRegressor(random_state=87)

    # 使用 GridSearchCV 進行超參數搜尋
    grid_search = GridSearchCV(
        estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=3, verbose=2
    )

    # 適用於訓練集
    grid_search.fit(X_train, y_train)

    # 輸出最佳參數和評分
    print("Best Parameters:", grid_search.best_params_)
    print("Best Score:", grid_search.best_score_)
    print()
    # 可選：返回訓練好的模型和測試數據
    # return grid_search, X_test, y_test
    evaluate_model(grid_search.best_estimator_, X_test, y_test)


# 使用範例：
# 假設 X 和 y 是你的特徵和目標變數
# trained_model, X_test, y_test = train_random_forest(X, y)

In [None]:
train_random_forest(X, Y)

In [None]:
train_random_forest(X_scaled, Y_scaled)

In [None]:
# 這個會跑很久!!!

%%time
# 級距分類

from sklearn.model_selection import GridSearchCV

# 拆分數據集
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=87)

# 設定要嘗試的超參數範圍
param_grid = {
    'n_estimators': [100, 200, 300],  # 樹的數量
    'max_features': ['sqrt', 'log2'],  # 劃分時考慮的最大特徵數
    'max_depth' : [4, 6, 8, 10],  # 樹的最大深度
    'criterion' :['gini', 'entropy']  # 劃分樹時的評估標準
}

# 創建隨機森林模型
rf_model = RandomForestClassifier(random_state=87)

# 使用 GridSearchCV 進行超參數搜尋
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=5, verbose=2)

# 適用於訓練集
grid_search.fit(X_train, y_train)

# 輸出最佳參數和評分
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
# 使用最佳模型進行預測
y_pred = grid_search.best_estimator_.predict(X_test)

# 計算各項評估指標
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

In [None]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# 假設 X 和 y 是您的特徵和目標變量
y = df['4']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 設定要嘗試的超參數範圍 (只選擇一個選項來減少計算時間)
param_grid = {
    'n_estimators': [100],  # 只選擇一個樹的數量
    'max_features': ['sqrt'],  # 只選擇一個劃分時考慮的最大特徵數選項
    'max_depth' : [4],  # 只選擇一個樹的最大深度
    'criterion' :['poisson']  # 只選擇一個劃分樹時的評估標準
}

# 創建隨機森林模型
rf_model = RandomForestRegressor(random_state=87)

# 使用 GridSearchCV 進行超參數搜尋
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=4, verbose=2)

# 適用於訓練集
grid_search.fit(X_train, y_train)

# 輸出最佳參數和評分
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)