# Regressão Logística
<p>Aplicação da técnica de regressão logística para analisar a popularidade dos filmes entre 2000 e 2024</p>

In [19]:
#Importação das bibliotecas
import pandas as pd
import numpy as ny
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import requests
import json
import re
import concurrent.futures

In [20]:
def createDateFrame(fileCSV):
    """Load a CSV file into a pandas DataFrame.

    Args:
        fileCSV: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The DataFrame parsed from ``fileCSV``.
    """
    return pd.read_csv(fileCSV)

In [21]:
def transformY(data):
    """Binarize the ``popularidade`` column of ``data`` in place.

    Each value becomes 1 when it is strictly greater than the column mean
    and 0 otherwise. Nothing is returned; ``data`` itself is modified.

    Args:
        data: DataFrame containing a ``popularidade`` column.
    """
    threshold = data["popularidade"].mean()
    above_mean = data["popularidade"] > threshold
    data["popularidade"] = above_mean.astype(int)

In [22]:
def dateXY(data, columns):
    """Split ``data`` into train and test sets for the popularity target.

    Args:
        data: DataFrame holding the ``popularidade`` target column and the
            feature columns.
        columns: List of column names to use as features.

    Returns:
        The result of ``train_test_split`` on the features and target, with
        30% of the rows held out for testing and a fixed random seed (42).
    """
    target = data["popularidade"]
    features = data[columns]
    return train_test_split(
        features,
        target,
        test_size=0.3,
        random_state=42,
    )

In [23]:
def createModel(x_train, y_train):
    """Fit a logistic regression classifier on the training data.

    Args:
        x_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` model (capped at 200 iterations).
    """
    classifier = LogisticRegression(max_iter=200)
    # fit() trains the estimator in place; the same object is returned below.
    classifier.fit(x_train, y_train)
    return classifier

In [24]:
# Load the dataset used by every cell below, then binarize its
# "popularidade" column in place (transformY returns nothing).
df = createDateFrame("../T2/Transform.csv")
transformY(df)

In [25]:
def result_metrics(y_test, x_test, pred, model):
    """Print the evaluation metrics of a fitted classifier.

    Prints accuracy, precision, recall, F1 score, the confusion matrix and
    the ROC AUC. All metrics are computed before anything is printed.

    Args:
        y_test: True labels of the test set.
        x_test: Test features, used to obtain class probabilities for the AUC.
        pred: Labels predicted for ``x_test``.
        model: Fitted classifier exposing ``predict_proba``.
    """
    scores = {
        "Acurácia": accuracy_score(y_test, pred),
        "Precisão": precision_score(y_test, pred),
        "Recall": recall_score(y_test, pred),
        "F1 Score": f1_score(y_test, pred),
    }
    matrix = confusion_matrix(y_test, pred)
    # The AUC is computed from the probability of the positive class (column 1).
    positive_proba = model.predict_proba(x_test)[:, 1]
    auc = roc_auc_score(y_test, positive_proba)

    for label, value in scores.items():
        print(f"{label}: {value:.2f}")
    print("Matriz de Confusão:\n", matrix)
    print(f"AUC: {auc:.2f}")

In [26]:
def logit(columns):
    """Train and evaluate a logistic regression on the given feature columns.

    Uses the module-level ``df``: splits it with ``dateXY``, fits a model
    with ``createModel`` and prints its metrics with ``result_metrics``.

    Args:
        columns: List of ``df`` column names to use as features.
    """
    x_train, x_test, y_train, y_test = dateXY(df, columns)
    classifier = createModel(x_train, y_train)
    predictions = classifier.predict(x_test)
    result_metrics(y_test, x_test, predictions, classifier)

In [27]:
# Run 1: train and evaluate using all five feature columns.
logit(['orcamento', 'receita', 'duracao', 'voto_popular', 'avaliacao_da_critica'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.55
F1 Score: 0.60
Matriz de Confusão:
 [[48 31]
 [46 57]]
AUC: 0.61


In [28]:
# Run 2: same evaluation without 'avaliacao_da_critica'.
logit(['orcamento', 'receita', 'duracao', 'voto_popular'])

Acurácia: 0.57
Precisão: 0.63
Recall: 0.55
F1 Score: 0.59
Matriz de Confusão:
 [[46 33]
 [46 57]]
AUC: 0.61


In [29]:
# Run 3: same evaluation with only 'orcamento', 'receita' and 'duracao'.
logit(['orcamento', 'receita', 'duracao'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.54
F1 Score: 0.59
Matriz de Confusão:
 [[49 30]
 [47 56]]
AUC: 0.58


In [30]:
# Run 4: same evaluation with only 'orcamento' and 'receita'.
logit(['orcamento', 'receita'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.54
F1 Score: 0.59
Matriz de Confusão:
 [[49 30]
 [47 56]]
AUC: 0.57


In [31]:
# Run 5: same evaluation with 'orcamento' as the single feature.
logit(['orcamento'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.54
F1 Score: 0.59
Matriz de Confusão:
 [[49 30]
 [47 56]]
AUC: 0.56


## Extraindo dados adicionais para a realização da análise

In [32]:
# Bearer token passed to the TMDB request helpers in this notebook.
# NOTE(review): this credential is hard-coded in the source; consider loading
# it from an environment variable or secrets store and rotating the exposed
# token.
tokenTMDB = "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIzZjZiNzNmOGE3NmNjZjA0OWU5OTQ2MzRhNWEyYjI3MyIsIm5iZiI6MTcyNDM4MzY5OC4wNzE4NjEsInN1YiI6IjY2YzYxN2YzNTk2MWNlZTg3ZTY5ZWQzYSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.ogQ7duMdbP16GPUyGSB5E200SjzropEXsuZUvgxxVzs"
def getProvidersMovie(token, ID, timeout=10):
    """Fetch the watch-provider listing of one movie from the TMDB API.

    Args:
        token: TMDB bearer token, sent in the ``Authorization`` header.
        ID: TMDB movie identifier interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response body, or ``None`` when the request fails or
        the body is not valid JSON (a message is printed in that case).
    """
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer {}".format(token),
    }
    url = "https://api.themoviedb.org/3/movie/{}/watch/providers".format(ID)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.json()
    except json.decoder.JSONDecodeError:
        # Checked before RequestException: newer requests versions raise a
        # JSON decoding error that is also a RequestException, which would
        # otherwise be reported as a fetch failure.
        print("Error decoding JSON")
    except requests.exceptions.RequestException:
        print("Error fetching data 1")
    return None

In [33]:
# Count, for every movie, the distinct subscription ("flatrate") streaming
# providers listed across all countries and store it in 'total_streaming'.
for index, movie in df.iterrows():
    data = getProvidersMovie(tokenTMDB, movie['id'])
    if not isinstance(data, dict) or 'results' not in data:
        # The fetch helper returns None on failure and a response may carry
        # no 'results' entry; record a missing value instead of aborting
        # the whole loop.
        df.at[index, 'total_streaming'] = float('nan')
        continue
    # A set de-duplicates providers that appear in several countries.
    providers = {
        provider['provider_name']
        for offers in data['results'].values()
        for provider in offers.get('flatrate', [])
    }
    df.at[index, 'total_streaming'] = len(providers)

In [34]:
def getReleaseDate(token, ID, timeout=10):
    """Fetch the release-date listing of one movie from the TMDB API.

    Args:
        token: TMDB bearer token, sent in the ``Authorization`` header.
        ID: TMDB movie identifier interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response body, or ``None`` when the request fails or
        the body is not valid JSON (a message is printed in that case).
    """
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer {}".format(token),
    }
    url = "https://api.themoviedb.org/3/movie/{}/release_dates".format(ID)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.json()
    except json.decoder.JSONDecodeError:
        # Checked before RequestException: newer requests versions raise a
        # JSON decoding error that is also a RequestException, which would
        # otherwise be reported as a fetch failure.
        print("Error decoding JSON")
    except requests.exceptions.RequestException:
        print("Error fetching data 1")
    return None

In [35]:
# Store, for every movie, the number of entries in the release-dates
# response under 'total_countries'.
for index, movie in df.iterrows():
    data = getReleaseDate(tokenTMDB, movie['id'])
    if not isinstance(data, dict) or 'results' not in data:
        # The fetch helper returns None on failure and a response may carry
        # no 'results' entry; record a missing value instead of aborting
        # the whole loop.
        df.at[index, 'total_countries'] = float('nan')
        continue
    df.at[index, 'total_countries'] = len(data['results'])

In [36]:
def getDetailsMovie(token, ID, timeout=10):
    """Fetch the details of one movie (in pt-BR) from the TMDB API.

    Args:
        token: TMDB bearer token, sent in the ``Authorization`` header.
        ID: TMDB movie identifier interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response body, or ``None`` when the request fails or
        the body is not valid JSON (a message is printed in that case).
    """
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer {}".format(token),
    }
    url = "https://api.themoviedb.org/3/movie/{}?language=pt-BR".format(ID)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.json()
    except json.decoder.JSONDecodeError:
        # Checked before RequestException: newer requests versions raise a
        # JSON decoding error that is also a RequestException, which would
        # otherwise be reported as a fetch failure.
        print("Error decoding JSON")
    except requests.exceptions.RequestException:
        print("Error fetching data 1")
    return None

In [83]:
def extrair_detalhes_premios(awards_string):
    """Parse an awards description into win/nomination counts.

    Args:
        awards_string: Free-text awards summary (for example
            ``"Won 3 Oscars. 150 wins & 200 nominations total"``). A falsy
            value yields all-zero results.

    Returns:
        A dict with:
            ``total_wins``: sum of every "N win(s)" and "Won N" figure found.
            ``total_nominations``: the first "N nomination(s)" figure, or 0.
            ``oscar_wins``: True when a "Won N Oscar" figure above zero is
            present, else False.
    """
    summary = {
        'total_wins': 0,
        'total_nominations': 0,
        'oscar_wins': False,
    }
    if not awards_string:
        return summary

    # Each match fills exactly one of the two groups ("N wins" or "Won N").
    for match in re.finditer(r'(\d+) wins?|Won (\d+)', awards_string):
        summary['total_wins'] += int(match.group(2) or match.group(1))

    # Only the first nominations figure is used.
    nominations_match = re.search(r'(\d+) nominations?', awards_string)
    if nominations_match is not None:
        summary['total_nominations'] = int(nominations_match.group(1))

    # The flag reflects whether at least one Oscar was won.
    oscar_match = re.search(r'Won (\d+) Oscar', awards_string)
    if oscar_match is not None:
        summary['oscar_wins'] = int(oscar_match.group(1)) > 0

    return summary

In [38]:
def getMovieOMDB(keyAPI, ID, timeout=10):
    """Fetch the full-plot record of one title from the OMDb API.

    Args:
        keyAPI: OMDb API key, sent as the ``apikey`` query parameter.
        ID: Title identifier sent as the ``i`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response body, or ``None`` when the request fails or
        the body is not valid JSON (a message is printed in that case).
    """
    # NOTE(review): the API key travels in a plain-HTTP query string; consider
    # switching to https if the service supports it.
    url = "http://www.omdbapi.com/?i={}&plot=full&apikey={}".format(ID, keyAPI)
    try:
        response = requests.get(url, timeout=timeout)
        return response.json()
    except json.decoder.JSONDecodeError:
        # Checked before RequestException: newer requests versions raise a
        # JSON decoding error that is also a RequestException, which would
        # otherwise be reported as a fetch failure.
        print("Error decoding JSON")
    except requests.exceptions.RequestException:
        print("Error fetching data 2")
    return None

In [99]:
# API key passed to getMovieOMDB when fetching awards data.
# NOTE(review): this credential is hard-coded in the source; consider loading
# it from an environment variable or secrets store and rotating the exposed key.
tokenOMDB = "d29535f4"

def process_movie(movie):
    """Look up one movie's awards and return the parsed summary.

    Fetches the movie details from TMDB to obtain its ``imdb_id``, queries
    OMDb with that id, and parses the ``Awards`` field of the response.

    Args:
        movie: Row-like object exposing the TMDB id under ``movie['id']``.

    Returns:
        The dict produced by ``extrair_detalhes_premios``.
    """
    tmdb_details = getDetailsMovie(tokenTMDB, movie['id'])
    omdb_record = getMovieOMDB(tokenOMDB, tmdb_details['imdb_id'])
    awards_text = omdb_record['Awards']
    return extrair_detalhes_premios(awards_text)

# Maps each DataFrame index label to the awards summary fetched for that row.
results = {}

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Key every future by the row's own index label rather than its position:
    # the df.at writes below are label-based, so positional keys would hit
    # the wrong rows (or create new ones) whenever the index is not 0..n-1.
    futures = {
        executor.submit(process_movie, movie): index
        for index, movie in df.iterrows()
    }

    for future in concurrent.futures.as_completed(futures):
        index = futures[future]  # Index label of the row this future belongs to
        try:
            results[index] = future.result()
        except Exception as exc:
            # Best effort: a failed lookup is reported and that row is left
            # without awards data instead of aborting the whole batch.
            print(f"Index {index} generated an exception: {exc}")

# Write the collected summaries back into the DataFrame.
for index, awards_data in results.items():
    df.at[index, 'total_awards'] = awards_data.get('total_wins', 0)
    df.at[index, 'total_nominations'] = awards_data.get('total_nominations', 0)
    df.at[index, 'oscar_wins'] = awards_data.get('oscar_wins', False)

In [104]:
def getLanguages(token, ID, timeout=10):
    """Fetch the translations listing of one movie from the TMDB API.

    Args:
        token: TMDB bearer token, sent in the ``Authorization`` header.
        ID: TMDB movie identifier interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response body, or ``None`` when the request fails or
        the body is not valid JSON (a message is printed in that case).
    """
    headers = {
        "accept": "application/json",
        "Authorization": "Bearer {}".format(token),
    }
    url = "https://api.themoviedb.org/3/movie/{}/translations".format(ID)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.json()
    except json.decoder.JSONDecodeError:
        # Checked before RequestException: newer requests versions raise a
        # JSON decoding error that is also a RequestException, which would
        # otherwise be reported as a fetch failure.
        print("Error decoding JSON")
    except requests.exceptions.RequestException:
        print("Error fetching data 1")
    return None

In [105]:
# Store, for every movie, the number of entries in the translations
# response under 'total_languages'.
for index, movie in df.iterrows():
    data = getLanguages(tokenTMDB, movie['id'])
    if not isinstance(data, dict) or 'translations' not in data:
        # The fetch helper returns None on failure and a response may carry
        # no 'translations' entry; record a missing value instead of
        # aborting the whole loop.
        df.at[index, 'total_languages'] = float('nan')
        continue
    df.at[index, 'total_languages'] = len(data['translations'])

In [107]:
def createCSV(data, filename):
    """Write ``data`` to a CSV file without the index column.

    Args:
        data: Anything accepted by the ``pd.DataFrame`` constructor.
        filename: Destination path or writable buffer for the CSV output.
    """
    pd.DataFrame(data).to_csv(filename, index=False)

In [108]:
# Persist the DataFrame, including the columns added above, to Dataset.csv.
createCSV(df, "Dataset.csv")