# Regressão Logística
<p>Aplicação da técnica de regressão logística para analisar a popularidade dos filmes entre 2000 a 2024</p>

In [45]:
#Importação das bibliotecas
import pandas as pd
import numpy as ny
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import requests
import json
from imdb import IMDb
import re
import concurrent.futures

In [2]:
def createDateFrame(fileCSV):
    dataFrame = pd.read_csv(fileCSV)
    return dataFrame

In [3]:
def transformY(data):
   data["popularidade"] = (data["popularidade"] > data["popularidade"].mean()).astype(int) 

In [4]:
def dateXY(data, columns):
    y = data["popularidade"]
    x = data[columns]
    return train_test_split(x, y, test_size=0.3, random_state=42)

In [5]:
def createModel(x_train, y_train):
    model = LogisticRegression(max_iter=200)
    model.fit(x_train, y_train)
    return model

In [6]:
df = createDateFrame("../T2/Transform.csv")
transformY(df)

In [7]:
def result_metrics(y_test, x_test, pred, model):
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    conf_matrix = confusion_matrix(y_test, pred)
    roc_auc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
    print(f"Acurácia: {accuracy:.2f}")
    print(f"Precisão: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print("Matriz de Confusão:\n", conf_matrix)
    print(f"AUC: {roc_auc:.2f}")

In [8]:
def logit(columns):
    X_train, X_test, Y_train, Y_test = dateXY(df, columns)
    model = createModel(X_train, Y_train)
    Y_pred = model.predict(X_test)
    result_metrics(Y_test, X_test, Y_pred, model)

In [9]:
logit(['orcamento', 'receita', 'duracao', 'voto_popular', 'avaliacao_da_critica'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.55
F1 Score: 0.60
Matriz de Confusão:
 [[48 31]
 [46 57]]
AUC: 0.61


In [10]:
logit(['orcamento', 'receita', 'duracao', 'voto_popular'])

Acurácia: 0.57
Precisão: 0.63
Recall: 0.55
F1 Score: 0.59
Matriz de Confusão:
 [[46 33]
 [46 57]]
AUC: 0.61


In [11]:
logit(['orcamento', 'receita', 'duracao'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.54
F1 Score: 0.59
Matriz de Confusão:
 [[49 30]
 [47 56]]
AUC: 0.58


In [12]:
logit(['orcamento', 'receita'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.54
F1 Score: 0.59
Matriz de Confusão:
 [[49 30]
 [47 56]]
AUC: 0.57


In [13]:
logit(['orcamento'])

Acurácia: 0.58
Precisão: 0.65
Recall: 0.54
F1 Score: 0.59
Matriz de Confusão:
 [[49 30]
 [47 56]]
AUC: 0.56


## Extraindo dados adicionais para a realização da análise

In [18]:
tokenTMDB = "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIzZjZiNzNmOGE3NmNjZjA0OWU5OTQ2MzRhNWEyYjI3MyIsIm5iZiI6MTcyNDM4MzY5OC4wNzE4NjEsInN1YiI6IjY2YzYxN2YzNTk2MWNlZTg3ZTY5ZWQzYSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.ogQ7duMdbP16GPUyGSB5E200SjzropEXsuZUvgxxVzs"
def getProvidersMovie(token, ID):
    try:
        headers = {
            "accept": "application/json",
            "Authorization": "Bearer {}".format(token)
        }
        url = "https://api.themoviedb.org/3/movie/{}/watch/providers".format(ID)
        response = requests.get(url, headers=headers)
        data = response.json()
        return data
    except requests.exceptions.RequestException:
        print("Error fetching data 1")
    except json.decoder.JSONDecodeError:
        print("Error decoding JSON")
    except KeyError:
        print("keyError 1")

In [33]:
for index, movie in df.iterrows():
    data = getProvidersMovie(tokenTMDB, movie['id'])
    i = 0
    providers = []
    for country in data['results']:
        if 'flatrate' in data['results'][country]:
            for plataform in data['results'][country]['flatrate']:
                if(not(plataform['provider_name'] in providers)):
                    i+=1
                    providers.append(plataform['provider_name'])
    df.at[index, 'total_streaming'] = i

In [35]:
def getReleaseDate(token, ID):
    try:
        headers = {
            "accept": "application/json",
            "Authorization": "Bearer {}".format(token)
        }
        url = "https://api.themoviedb.org/3/movie/{}/release_dates".format(ID)
        response = requests.get(url, headers=headers)
        data = response.json()
        return data
    except requests.exceptions.RequestException:
        print("Error fetching data 1")
    except json.decoder.JSONDecodeError:
        print("Error decoding JSON")
    except KeyError:
        print("keyError 1")

In [36]:
for index, movie in df.iterrows():
    data = getReleaseDate(tokenTMDB, movie['id'])
    i = len(data['results'])
    df.at[index, 'total_countries'] = i

In [39]:
def getDetailsMovie(token, ID):
    try:
        headers = {
            "accept": "application/json",
            "Authorization": "Bearer {}".format(token)
        }
        url = "https://api.themoviedb.org/3/movie/{}?language=pt-BR".format(ID)
        response = requests.get(url, headers=headers)
        data = response.json()
        return data
    except requests.exceptions.RequestException:
        print("Error fetching data 1")
    except json.decoder.JSONDecodeError:
        print("Error decoding JSON")
    except KeyError:
        print("keyError 1")

In [41]:
def extrair_total_premios(awards_string):
    won_matches = re.findall(r'Won (\d+)', awards_string)
    another_matches = re.findall(r'Another (\d+)', awards_string)
    won_count = sum(int(match) for match in won_matches) if won_matches else 0
    another_count = sum(int(match) for match in another_matches) if another_matches else 0
    return won_count + another_count


In [46]:
api = IMDb()

def process_movie(movie):
    data = getDetailsMovie(tokenTMDB, movie['id'])
    imdb_id = data['imdb_id'][2:] 
    awards_data = api.get_movie(imdb_id)
    awards = awards_data.get('awards', None)
    if awards:
        return extrair_total_premios(awards)
    else:
        return 0

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_movie, movie) for _, movie in df.iterrows()]
    for index, future in enumerate(concurrent.futures.as_completed(futures)):
        df.at[index, 'total_awards'] = future.result()

In [47]:
df['total_awards']

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
601    0.0
602    0.0
603    0.0
604    0.0
605    0.0
Name: total_awards, Length: 606, dtype: float64

In [67]:
awards = api.get_movie(2096673)
api.update(awards, info=['main', 'awards'])

In [69]:
print(awards['awards'])

KeyError: 'awards'