# Regressão Logística
<p>Aplicação da técnica de regressão logística para analisar a popularidade dos filmes entre 2000 e 2024</p>

In [2]:
#Importação das bibliotecas
import concurrent.futures
import json
import os
import re

import country_converter as cc
import pandas as pd
import requests
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
def createDateFrame(fileCSV):
    """Read a CSV file into a pandas DataFrame.

    Args:
        fileCSV: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(fileCSV)

In [4]:
def transformY(data):
    """Binarize the "popularidade" column of ``data`` in place.

    Each value becomes 1 when it is strictly greater than the column mean
    and 0 otherwise. The frame is modified in place and nothing is returned,
    so calling this twice on the same frame re-thresholds the already
    binarized column.

    Args:
        data: DataFrame containing a numeric "popularidade" column.
    """
    popularity = data["popularidade"]
    above_mean = popularity.gt(popularity.mean())
    data["popularidade"] = above_mean.astype(int)

In [5]:
def dateXY(data, columns):
    """Split ``data`` into train and test sets for the popularity model.

    Args:
        data: DataFrame holding the "popularidade" target and the predictors.
        columns: List of predictor column names to select from ``data``.

    Returns:
        The four-element result of ``train_test_split`` in the order
        (x_train, x_test, y_train, y_test), using a 30% test share and a
        fixed random state of 42.
    """
    target = data["popularidade"]
    features = data[columns]
    split_options = {"test_size": 0.3, "random_state": 42}
    return train_test_split(features, target, **split_options)

In [6]:
def createModel(x_train, y_train):
    """Fit a logistic regression classifier on the training data.

    Args:
        x_train: Training predictors.
        y_train: Training target values.

    Returns:
        The fitted ``LogisticRegression`` estimator (max_iter=100000).
    """
    # NOTE(review): predictors are handed to the solver exactly as received
    # (no scaling step here) -- confirm whether standardization is wanted.
    # ``fit`` returns the estimator itself, so the call can be chained.
    return LogisticRegression(max_iter=100000).fit(x_train, y_train)

In [7]:
# Load the dataset from the CSV file, then replace the "popularidade" column
# in place with its 0/1 encoding (1 = strictly above the column mean).
df = createDateFrame("../T2/Transform.csv")
transformY(df)

In [8]:
def result_metrics(y_test, x_test, pred, model):
    """Print the classification metrics of a fitted model and return accuracy.

    Args:
        y_test: True target values of the test set.
        x_test: Test predictors, used to obtain class probabilities.
        pred: Predicted labels for ``x_test``.
        model: Fitted classifier exposing ``predict_proba``.

    Returns:
        The accuracy score of ``pred`` against ``y_test``.
    """
    # All metrics are computed before anything is printed, so a failure in
    # any of them leaves no partial report behind.
    accuracy, precision, recall, f1 = (
        metric(y_test, pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    conf_matrix = confusion_matrix(y_test, pred)
    # Column 1 of predict_proba is the probability of the positive class.
    positive_class_proba = model.predict_proba(x_test)[:, 1]
    roc_auc = roc_auc_score(y_test, positive_class_proba)

    print(f"Acurácia: {accuracy:.2f}")
    print(f"Precisão: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print("Matriz de Confusão:\n", conf_matrix)
    print(f"AUC: {roc_auc:.2f}")
    return accuracy

In [9]:
def logit(data, columns):
    """Train and evaluate a logistic regression on the chosen predictors.

    Splits ``data`` with ``dateXY``, fits a model with ``createModel`` and
    reports the test-set metrics through ``result_metrics``.

    Args:
        data: DataFrame with the "popularidade" target and the predictors.
        columns: List of predictor column names.

    Returns:
        The test-set accuracy returned by ``result_metrics``.
    """
    x_train, x_test, y_train, y_test = dateXY(data, columns)
    fitted = createModel(x_train, y_train)
    return result_metrics(y_test, x_test, fitted.predict(x_test), fitted)

In [10]:
# Fit and evaluate the model with five predictors.
logit(df, ['orcamento', 'receita', 'duracao', 'voto_popular', 'avaliacao_da_critica'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.55
F1 Score: 0.60
Matriz de Confusão:
 [[48 31]
 [46 57]]
AUC: 0.61


0.5769230769230769

In [11]:
# Fit and evaluate with four predictors ('avaliacao_da_critica' left out).
logit(df, ['orcamento', 'receita', 'duracao', 'voto_popular'])

Acurácia: 0.57
Precisão: 0.63
Recall: 0.55
F1 Score: 0.59
Matriz de Confusão:
 [[46 33]
 [46 57]]
AUC: 0.61


0.5659340659340659

In [12]:
# Fit and evaluate with three predictors: budget, revenue and runtime columns.
logit(df, ['orcamento', 'receita', 'duracao'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.54
F1 Score: 0.59
Matriz de Confusão:
 [[49 30]
 [47 56]]
AUC: 0.58


0.5769230769230769

In [13]:
# Fit and evaluate with two predictors: 'orcamento' and 'receita'.
logit(df, ['orcamento', 'receita'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.54
F1 Score: 0.59
Matriz de Confusão:
 [[49 30]
 [47 56]]
AUC: 0.57


0.5769230769230769

In [14]:
# Fit and evaluate with 'orcamento' as the only predictor.
logit(df, ['orcamento'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.54
F1 Score: 0.59
Matriz de Confusão:
 [[49 30]
 [47 56]]
AUC: 0.56


0.5769230769230769

## Extraindo dados adicionais para a realização da análise

In [15]:
# TMDB API bearer token, read from the TMDB_TOKEN environment variable so the
# credential is not stored in the notebook. A missing variable raises KeyError
# here, before any request is attempted.
# NOTE: the token that used to be hardcoded on this line has been exposed in
# the notebook and should be revoked/rotated in the TMDB account settings.
tokenTMDB = os.environ["TMDB_TOKEN"]
def getProvidersMovie(token, ID):
    """Fetch the watch-provider listing of one movie from the TMDB API.

    Args:
        token: TMDB bearer token sent in the Authorization header.
        ID: TMDB movie id interpolated into the request URL.

    Returns:
        The decoded JSON body of the response, or None when the request
        fails, the server answers with an HTTP error status, or the body is
        not valid JSON (a message is printed in those cases).
    """
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer {}".format(token)
    }
    url = "https://api.themoviedb.org/3/movie/{}/watch/providers".format(ID)
    try:
        # The timeout keeps one stalled connection from blocking the caller forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Turn 4xx/5xx answers into an exception instead of returning the
        # error payload as if it were movie data.
        response.raise_for_status()
        return response.json()
    except json.decoder.JSONDecodeError:
        # Checked first: recent requests versions raise a decode error that is
        # also a RequestException.
        print("Error decoding JSON")
    except requests.exceptions.RequestException:
        print("Error fetching data 1")
    return None

In [16]:
# Count, for every movie, the distinct subscription ("flatrate") streaming
# providers listed across all countries and store it in 'total_streaming'.
for index, movie in df.iterrows():
    data = getProvidersMovie(tokenTMDB, movie['id'])
    if not isinstance(data, dict) or 'results' not in data:
        # The fetch failed or returned an unexpected payload: record the value
        # as missing rather than crashing or inventing a count of zero.
        df.at[index, 'total_streaming'] = float('nan')
        continue
    # A set de-duplicates providers that appear in more than one country.
    providers = {
        plataform['provider_name']
        for offers in data['results'].values()
        for plataform in offers.get('flatrate', [])
    }
    df.at[index, 'total_streaming'] = len(providers)

In [17]:
def getReleaseDate(token, ID):
    """Fetch the release-date listing of one movie from the TMDB API.

    Args:
        token: TMDB bearer token sent in the Authorization header.
        ID: TMDB movie id interpolated into the request URL.

    Returns:
        The decoded JSON body of the response, or None when the request
        fails, the server answers with an HTTP error status, or the body is
        not valid JSON (a message is printed in those cases).
    """
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer {}".format(token)
    }
    url = "https://api.themoviedb.org/3/movie/{}/release_dates".format(ID)
    try:
        # The timeout keeps one stalled connection from blocking the caller forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Turn 4xx/5xx answers into an exception instead of returning the
        # error payload as if it were movie data.
        response.raise_for_status()
        return response.json()
    except json.decoder.JSONDecodeError:
        # Checked first: recent requests versions raise a decode error that is
        # also a RequestException.
        print("Error decoding JSON")
    except requests.exceptions.RequestException:
        print("Error fetching data 1")
    return None

In [18]:
# Store, for every movie, the number of entries in the release-date listing
# ('results') as 'total_countries'.
for index, movie in df.iterrows():
    data = getReleaseDate(tokenTMDB, movie['id'])
    releases = data.get('results') if isinstance(data, dict) else None
    # A failed fetch is recorded as missing (NaN) instead of raising on None.
    df.at[index, 'total_countries'] = len(releases) if releases is not None else float('nan')

In [19]:
def getDetailsMovie(token, ID):
    """Fetch the details of one movie (pt-BR language) from the TMDB API.

    Args:
        token: TMDB bearer token sent in the Authorization header.
        ID: TMDB movie id interpolated into the request URL.

    Returns:
        The decoded JSON body of the response, or None when the request
        fails, the server answers with an HTTP error status, or the body is
        not valid JSON (a message is printed in those cases).
    """
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer {}".format(token)
    }
    url = "https://api.themoviedb.org/3/movie/{}?language=pt-BR".format(ID)
    try:
        # The timeout keeps one stalled connection from blocking the caller forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Turn 4xx/5xx answers into an exception instead of returning the
        # error payload as if it were movie data.
        response.raise_for_status()
        return response.json()
    except json.decoder.JSONDecodeError:
        # Checked first: recent requests versions raise a decode error that is
        # also a RequestException.
        print("Error decoding JSON")
    except requests.exceptions.RequestException:
        print("Error fetching data 1")
    return None

In [20]:
def extrair_detalhes_premios(awards_string):
    """Parse an awards summary string into win/nomination figures.

    Args:
        awards_string: Text such as "Won 3 Oscars. 150 wins & 200
            nominations total". Empty or None yields all-zero details.

    Returns:
        A dict with:
            'total_wins': sum of every "<n> win(s)" and "Won <n>" number found,
            'total_nominations': number of the first "<n> nomination(s)" match,
            'oscar_wins': True when a "Won <n> Oscar" match has n > 0.
    """
    if not awards_string:
        return {'total_wins': 0, 'total_nominations': 0, 'oscar_wins': False}

    # Each findall match fills exactly one of the two groups:
    # "<n> win"/"<n> wins" fills the first, "Won <n>" fills the second.
    total_wins = 0
    for plain_count, won_count in re.findall(r'(\d+) wins?|Won (\d+)', awards_string):
        total_wins += int(won_count or plain_count)

    # Only the first nominations figure in the text is used.
    nomination_match = re.search(r'(\d+) nominations?', awards_string)
    if nomination_match is None:
        total_nominations = 0
    else:
        total_nominations = int(nomination_match.group(1))

    # The Oscar information is reduced to a flag: at least one Oscar won.
    oscar_match = re.search(r'Won (\d+) Oscar', awards_string)
    has_oscar = oscar_match is not None and int(oscar_match.group(1)) > 0

    return {
        'total_wins': total_wins,
        'total_nominations': total_nominations,
        'oscar_wins': has_oscar
    }

In [21]:
def getMovieOMDB(keyAPI, ID):
    """Fetch the OMDb record (full plot) of one title by its IMDb id.

    Args:
        keyAPI: OMDb API key sent as the ``apikey`` query parameter.
        ID: IMDb id sent as the ``i`` query parameter.

    Returns:
        The decoded JSON body of the response, or None when the request
        fails, the server answers with an HTTP error status, or the body is
        not valid JSON (a message is printed in those cases).
    """
    # HTTPS keeps the API key out of clear-text traffic; ``params`` lets
    # requests URL-encode the values.
    url = "https://www.omdbapi.com/"
    query = {"i": ID, "plot": "full", "apikey": keyAPI}
    try:
        # The timeout keeps one stalled connection from blocking a worker forever.
        response = requests.get(url, params=query, timeout=10)
        # Turn 4xx/5xx answers into an exception instead of returning them as data.
        response.raise_for_status()
        return response.json()
    except json.decoder.JSONDecodeError:
        # Checked first: recent requests versions raise a decode error that is
        # also a RequestException.
        print("Error decoding JSON")
    except requests.exceptions.RequestException:
        print("Error fetching data 2")
    return None

In [22]:
# OMDb API key, read from the OMDB_API_KEY environment variable so the
# credential is not stored in the notebook. A missing variable raises KeyError
# here, before any request is attempted.
# NOTE: the key that used to be hardcoded on this line has been exposed in the
# notebook and should be replaced with a new one.
tokenOMDB = os.environ["OMDB_API_KEY"]

def process_movie(movie):
    """Collect the parsed awards details of one movie row.

    Looks up the movie on TMDB to obtain its 'imdb_id', queries OMDb with
    that id and parses the 'Awards' field of the answer.

    Args:
        movie: Row-like object whose 'id' entry is the TMDB movie id.

    Returns:
        The dict produced by ``extrair_detalhes_premios``.
    """
    tmdb_details = getDetailsMovie(tokenTMDB, movie['id'])
    omdb_record = getMovieOMDB(tokenOMDB, tmdb_details['imdb_id'])
    return extrair_detalhes_premios(omdb_record['Awards'])

# Maps each DataFrame index label to the parsed awards dict of that movie.
results = {}

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Key every future by the row's own index label (not its enumerate
    # position): the write-back below uses label-based ``df.at``, so the two
    # must agree even when the index is not 0..n-1.
    futures = {
        executor.submit(process_movie, movie): index
        for index, movie in df.iterrows()
    }

    for future in concurrent.futures.as_completed(futures):
        index = futures[future]  # index label of the movie this future belongs to
        try:
            results[index] = future.result()
        except Exception as exc:
            # Best effort: a failed movie is reported and simply gets no
            # entry in ``results``.
            print(f"Index {index} generated an exception: {exc}")

# Write the collected results back into the DataFrame.
for index, awards_data in results.items():
    df.at[index, 'total_awards'] = awards_data.get('total_wins', 0)
    df.at[index, 'total_nominations'] = awards_data.get('total_nominations', 0)
    df.at[index, 'oscar_wins'] = awards_data.get('oscar_wins', False)

In [23]:
def getLanguages(token, ID):
    """Fetch the translation listing of one movie from the TMDB API.

    Args:
        token: TMDB bearer token sent in the Authorization header.
        ID: TMDB movie id interpolated into the request URL.

    Returns:
        The decoded JSON body of the response, or None when the request
        fails, the server answers with an HTTP error status, or the body is
        not valid JSON (a message is printed in those cases).
    """
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer {}".format(token)
    }
    url = "https://api.themoviedb.org/3/movie/{}/translations".format(ID)
    try:
        # The timeout keeps one stalled connection from blocking the caller forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Turn 4xx/5xx answers into an exception instead of returning the
        # error payload as if it were movie data.
        response.raise_for_status()
        return response.json()
    except json.decoder.JSONDecodeError:
        # Checked first: recent requests versions raise a decode error that is
        # also a RequestException.
        print("Error decoding JSON")
    except requests.exceptions.RequestException:
        print("Error fetching data 1")
    return None

In [24]:
# Store, for every movie, the number of entries in its translation listing
# ('translations') as 'total_languages'.
for index, movie in df.iterrows():
    data = getLanguages(tokenTMDB, movie['id'])
    translations = data.get('translations') if isinstance(data, dict) else None
    # A failed fetch is recorded as missing (NaN) instead of raising on None.
    df.at[index, 'total_languages'] = len(translations) if translations is not None else float('nan')

In [25]:
def createCSV(data, filename):
    """Write ``data`` to a CSV file without the index column.

    Args:
        data: Anything accepted by the ``pd.DataFrame`` constructor.
        filename: Destination path of the CSV file.
    """
    pd.DataFrame(data).to_csv(filename, index=False)

In [26]:
# Persist the enriched frame (with the columns added by the cells above) to disk.
createCSV(df, "Dataset.csv")

## Testando o modelo com novas variáveis

In [27]:
# Reload the dataset from the saved CSV file.
df = createDateFrame("Dataset.csv")

In [28]:
# Bucket the 'ano' column into three periods. With right=False every interval
# is closed on the left and open on the right, so each label runs from its
# lower edge to the upper edge minus one.
bins = [1940, 1970, 2000, 2030]
labels = [f"{start}-{end - 1}" for start, end in zip(bins, bins[1:])]
df['periodo'] = pd.cut(df['ano'], bins=bins, labels=labels, right=False)

In [29]:
# Convert the 'pais' column to continent names with country_converter.
# NOTE(review): the recorded output shows "More than one regular expression
# match" warnings for comma-separated multi-country values, and those rows end
# up holding a list of continents -- presumably splitting the countries before
# converting would avoid the warnings; confirm.
df['continente'] = cc.convert(df['pais'], to='continent')

More than one regular expression match for United States, United Kingdom, Australia, New Zealand, Canada
More than one regular expression match for United States, United Kingdom, Australia, New Zealand, Canada
More than one regular expression match for United States, United Kingdom, Australia, New Zealand, Canada
More than one regular expression match for United States, United Kingdom, Australia, New Zealand, Canada
More than one regular expression match for United States, United Kingdom, Australia, New Zealand, Canada
More than one regular expression match for United States, United Kingdom, Australia, New Zealand, Canada
More than one regular expression match for United States, United Kingdom, Australia, New Zealand, Canada
More than one regular expression match for United States, United Kingdom, Australia, New Zealand, Canada
More than one regular expression match for United States, United Kingdom, Australia, New Zealand, Canada
More than one regular expression match for United State

In [30]:
# Inspect the converted column: the recorded output shows plain strings for
# some rows and lists (with repeats) for others.
df['continente']

0      [Oceania, America, Oceania, Europe, America]
1                                   [Asia, America]
2                                           America
3                                           America
4                                           America
                           ...                     
601                              [America, America]
602                                         America
603                                         America
604                                         America
605                         [Europe, Asia, America]
Name: continente, Length: 606, dtype: object

In [31]:
def clean_continents(continents):
    """Normalize a continent value to a sorted list of unique valid names.

    Args:
        continents: A single continent name, an iterable of names, or a
            non-iterable placeholder such as None or NaN.

    Returns:
        A list of the distinct valid continent names found, in alphabetical
        order so the result is the same on every run. Names outside the
        valid set are dropped; non-iterable input yields an empty list.
    """
    valid_continents = {"Africa", "America", "Asia", "Europe", "Oceania"}
    if isinstance(continents, str):
        # A lone string is one continent, not a sequence of characters.
        continents = [continents]
    try:
        candidates = iter(continents)
    except TypeError:
        # Missing values (None, NaN) carry no continent information.
        return []
    return sorted({continent for continent in candidates if continent in valid_continents})

# Replace every row with a de-duplicated list of valid continent names.
df['continente'] = df['continente'].apply(clean_continents)

In [32]:
# Indicator columns for 'periodo'; drop_first=True omits the first category level.
dummies_periodo = pd.get_dummies(df['periodo'], prefix='periodo', drop_first=True)

# 'continente' holds a list per row: explode() yields one row per continent
# (repeating the original index), get_dummies encodes them, and
# groupby(level=0).sum() folds the indicators back to one row per original index.
dummies_continente = pd.get_dummies(df['continente'].explode(), prefix='continente', drop_first=True).groupby(level=0).sum()

# Append the indicator columns to the frame.
# NOTE: this rebinds df, so re-running the cell appends the columns a second time.
df = pd.concat([df, dummies_periodo, dummies_continente], axis=1)

## Verificando suposições da técnica de Regressão Logística

In [33]:
# Multicollinearity check: build the predictor matrix, add an intercept
# column and compute the variance inflation factor of every column.
non_predictors = ['popularidade', 'id', 'genero', 'diretor', 'titulo', 'produtoras', 'pais', 'ano', 'continente', 'periodo', 'periodo_1970-1999']
X = df.drop(columns=non_predictors)
# Boolean columns are cast to integers before the matrix is built.
bool_columns = X.select_dtypes('bool').columns
X = X.astype(dict.fromkeys(bool_columns, 'int'))
X = sm.add_constant(X)
vif_values = [variance_inflation_factor(X.values, position) for position in range(X.shape[1])]
vif_data = pd.DataFrame({'Variável': X.columns, 'VIF': vif_values})
print(vif_data)

                Variável           VIF
0                  const  14443.808858
1           voto_popular      2.154550
2              orcamento      3.156374
3                receita      3.207454
4                duracao      1.402760
5   avaliacao_da_critica      1.987100
6        total_streaming      1.571255
7        total_countries      1.472893
8           total_awards      3.943948
9      total_nominations      3.728261
10            oscar_wins      1.941382
11       total_languages      2.466788
12     periodo_2000-2029      1.504577
13    continente_America      1.497404
14       continente_Asia      1.192537
15     continente_Europe      1.051707
16    continente_Oceania      1.043640


In [34]:
# Column names of X without the first one (the constant added by sm.add_constant).
varsX = X.columns[1:]
print(varsX)

Index(['voto_popular', 'orcamento', 'receita', 'duracao',
       'avaliacao_da_critica', 'total_streaming', 'total_countries',
       'total_awards', 'total_nominations', 'oscar_wins', 'total_languages',
       'periodo_2000-2029', 'continente_America', 'continente_Asia',
       'continente_Europe', 'continente_Oceania'],
      dtype='object')


In [35]:

# Candidate predictor names, matching the X.columns[1:] listing printed above.
# The logit(...) calls visible after this cell pass their own literal lists.
independent_vars = ['voto_popular', 'orcamento', 'receita', 'duracao',
       'avaliacao_da_critica', 'total_streaming', 'total_countries',
       'total_awards', 'total_nominations', 'oscar_wins', 'total_languages',
       'periodo_2000-2029', 'continente_America', 'continente_Asia',
       'continente_Europe', 'continente_Oceania']

In [36]:
# Fit and evaluate with ten predictors, including the collected API features,
# 'oscar_wins' and the 'continente_Asia' indicator.
logit(df, ['orcamento', 'receita', 'duracao', 'avaliacao_da_critica', 'total_streaming', 'total_countries', 'total_awards', 'oscar_wins', 'total_languages', 'continente_Asia'])

Acurácia: 0.64
Precisão: 0.71
Recall: 0.63
F1 Score: 0.67
Matriz de Confusão:
 [[52 27]
 [38 65]]
AUC: 0.67


0.6428571428571429

In [37]:
# Fit and evaluate with ten predictors, using the 'continente_America' and
# 'continente_Europe' indicators and no 'oscar_wins'.
logit(df, ['orcamento', 'receita', 'duracao', 'avaliacao_da_critica', 'total_streaming', 'total_countries', 'total_awards', 'total_languages', 'continente_America', 'continente_Europe'])

Acurácia: 0.65
Precisão: 0.72
Recall: 0.62
F1 Score: 0.67
Matriz de Confusão:
 [[54 25]
 [39 64]]
AUC: 0.67


0.6483516483516484