In [1]:
import pandas as pd

# Cargar el archivo CSV en un DataFrame de pandas
file_path = 'C:/Users/HP/OneDrive - Universidad Nacional de Costa Rica/Escritorio/Dataset_analisis/All Ended Netflix Original Shows.csv'
data = pd.read_csv(file_path)

# Mostrar las primeras filas del DataFrame para inspección
data.head()



Unnamed: 0,Title,Genre,Premiere Date,Finale Date,Seasons,Cancelled
0,House of Cards,Political drama,"February 1, 2013","November 2, 2018",6,0
1,Orange Is the New Black,Comedy drama,"July 11, 2013","July 26, 2019",7,0
2,Marco Polo,Historical drama,"December 12, 2014","July 1, 2016",2,0
3,Bloodline,Thriller,"March 20, 2015","May 26, 2017",3,0
4,Daredevil,Superherolegal drama,"April 10, 2015","October 19, 2018",3,0


In [2]:

data['Genre'] = data['Genre'].fillna('Unknown')

# Verificar si la conversión de fecha introdujo nuevos valores nulos
#data['Premiere Date'] = pd.to_datetime(data['Premiere Date'], errors='coerce')
#data['Finale Date'] = pd.to_datetime(data['Finale Date'], errors='coerce')

# Verificar si la conversión de fecha introdujo nuevos valores nulos
#new_nulls_after_conversion = data.isnull().sum()
#new_nulls_after_conversion

null_finale_date_rows = data[data['Genre'].notnull()]

# Mostrar las filas que tienen el Genre unknown
data[data['Genre'] == 'Unknown']

#null_finale_date_rows

Unnamed: 0,Title,Genre,Premiere Date,Finale Date,Seasons,Cancelled
302,All Hail King Julien,Unknown,"December 19, 2014","December 1, 2017",5,0
303,Ever After High,Unknown,"February 6, 2015","August 5, 2016",5,0
304,Dragons: Race to the Edge,Unknown,"June 26, 2015","February 16, 2018",6,0
305,Popples,Unknown,"October 30, 2015","July 24, 2016",3,0
306,Care Bears and Cousins,Unknown,"November 6, 2015","February 5, 2016",2,0
...,...,...,...,...,...,...
1330,VeggieTales in the City,Unknown,"February 24, 2017","September 15, 2017",2,0
1331,Spirit Riding Free,Unknown,"May 5, 2017","April 5, 2019",8,0
1332,Dinotrux Supercharged,Unknown,"November 10, 2017","August 3, 2018",3,0
1333,Ask the StoryBotsKorea Edition,Unknown,"May 5, 2021","May 5, 2021",1,1


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE  # para técnica de sobremuestreo
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans  # algoritmo no supervisado
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Asumiendo que 'data' es tu DataFrame después de la limpieza

# Codificación One-Hot para variables categóricas
encoder = OneHotEncoder(drop='first')  # drop='first' para evitar la multicolinealidad
genre_encoded = encoder.fit_transform(data[['Genre']]).toarray()
genre_encoded_df = pd.DataFrame(genre_encoded, columns=encoder.get_feature_names_out(['Genre']))

# Concatenar con el conjunto de datos original y eliminar la columna categórica original
data = pd.concat([data, genre_encoded_df], axis=1).drop(columns=['Genre', 'Title', 'Premiere Date', 'Finale Date'])  # eliminamos las columnas que no necesitamos

# Dividir el conjunto de datos en características (X) y variable objetivo (y)
X = data.drop(columns=['Cancelled'])
y = data['Cancelled']

# Dividir el conjunto de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # ajusta test_size según tus necesidades

# Aplicar SMOTE para manejar el desequilibrio de clases
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Algoritmos supervisados
# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_smote, y_train_smote)
y_pred_rf = rf.predict(X_test)
print("Random Forest:")
print(classification_report(y_test, y_pred_rf))

# SVM
svm = SVC(random_state=42)
svm.fit(X_train_smote, y_train_smote)
y_pred_svm = svm.predict(X_test)
print("SVM:")
print(classification_report(y_test, y_pred_svm))

# KNN
knn = KNeighborsClassifier()
knn.fit(X_train_smote, y_train_smote)
y_pred_knn = knn.predict(X_test)
print("KNN:")
print(classification_report(y_test, y_pred_knn))

# Algoritmo no supervisado - KMeans
kmeans = KMeans(n_clusters=2, random_state=42)  # elige el número de clusters apropiado
kmeans.fit(X)
data['cluster'] = kmeans.labels_

# Ajuste de hiperparámetros (usaremos Random Forest como ejemplo)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_smote, y_train_smote)
best_grid = grid_search.best_estimator_

y_pred_best_grid = best_grid.predict(X_test)
print("Random Forest con ajuste de hiperparámetros:")
print(classification_report(y_test, y_pred_best_grid))

Random Forest:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        64
           1       0.99      1.00      1.00       207

    accuracy                           0.99       271
   macro avg       1.00      0.98      0.99       271
weighted avg       0.99      0.99      0.99       271

SVM:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        64
           1       0.99      1.00      1.00       207

    accuracy                           0.99       271
   macro avg       1.00      0.98      0.99       271
weighted avg       0.99      0.99      0.99       271

KNN:
              precision    recall  f1-score   support

           0       0.57      0.94      0.71        64
           1       0.98      0.78      0.87       207

    accuracy                           0.82       271
   macro avg       0.77      0.86      0.79       271
weighted avg       0.88      0.82      0.83      

  super()._check_params_vs_input(X, default_n_init=10)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Random Forest con ajuste de hiperparámetros:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        64
           1       0.99      1.00      1.00       207

    accuracy                           0.99       271
   macro avg       1.00      0.98      0.99       271
weighted avg       0.99      0.99      0.99       271

