In [None]:
# Importar módulos
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,accuracy_score,roc_curve,confusion_matrix

In [None]:
# Load the datasets
# Mount Google Drive so the CSV files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Both files are read with pandas' default separator, which is the comma.
instagram_df_train = pd.read_csv('/content/drive/MyDrive/insta_train.csv')
instagram_df_test = pd.read_csv('/content/drive/MyDrive/insta_test.csv')

In [None]:
# Display the training DataFrame as this cell's output
instagram_df_train

In [None]:
# Display the test DataFrame as this cell's output
instagram_df_test

In [None]:
# Exploratory data analysis (EDA)

# Print the column names, dtypes and non-null counts of each dataset,
# training set first and test set second.
for dataset in (instagram_df_train, instagram_df_test):
    dataset.info()

In [None]:
# Statistical summary of the training dataset
instagram_df_train.describe()

In [None]:
# Count the missing values in every column of the training set
# (a column of all zeros means there is nothing to impute).
instagram_df_train.isna().sum()

In [None]:
# Number of rows per distinct value of the 'profile pic' column
# (the original note expects the values 0 and 1)
instagram_df_train['profile pic'].value_counts()

In [None]:
# Number of rows per distinct value of the 'fake' column, which is the
# desired output of the model (the original note expects the values 0 and 1)
instagram_df_train['fake'].value_counts()

In [None]:
# Test data: statistical summary of the test dataset
instagram_df_test.describe()

In [None]:
# Count the missing values in every column of the test set
instagram_df_test.isna().sum()

In [None]:
# Number of rows per distinct value of the 'fake' column in the test set
instagram_df_test['fake'].value_counts()

In [None]:
# Data visualisation

# Bar chart of how many training rows fall into each 'fake' class
sns.countplot(data=instagram_df_train, x='fake')
plt.show()

In [None]:
# Bar chart of how many training rows have each value of 'private'
sns.countplot(data=instagram_df_train, x='private')
plt.show()

In [None]:
# Bar chart of how many training rows have each value of 'profile pic'
sns.countplot(data=instagram_df_train, x='profile pic')
plt.show()

In [None]:
# Histogram of the 'nums/length username' column of the training set
plt.figure(figsize=(20, 10))
# Draw on the axes of the wide figure created just above.
sns.histplot(data=instagram_df_train, x='nums/length username', ax=plt.gca())
plt.show()

In [None]:
# Correlation matrix of the training set, drawn as an annotated heatmap
cm = instagram_df_train.corr()

# One large figure with a single axes so the per-cell annotations stay readable.
ax = plt.figure(figsize=(20, 20)).add_subplot()
sns.heatmap(cm, annot=True, ax=ax)
plt.show()

In [None]:
# Test data: bar chart of how many test rows fall into each 'fake' class
sns.countplot(x = instagram_df_test['fake'])
# Render the figure explicitly, like the training-set plots do, so the cell
# does not echo the Axes object's repr as its output.
plt.show()

In [None]:
# Test data: bar chart of how many test rows have each value of 'private'
sns.countplot(x = instagram_df_test['private'])
# Render the figure explicitly, like the training-set plots do, so the cell
# does not echo the Axes object's repr as its output.
plt.show()

In [None]:
# Test data: bar chart of how many test rows have each value of 'profile pic'
sns.countplot(x = instagram_df_test['profile pic'])
# Render the figure explicitly, like the training-set plots do, so the cell
# does not echo the Axes object's repr as its output.
plt.show()

In [None]:
# Prepare the data for model training

# Input variables: every training column except the 'fake' target
X_train = instagram_df_train.drop('fake', axis='columns')
X_train

In [None]:
# Input variables of the test set: every column except the 'fake' target
X_test = instagram_df_test.drop('fake', axis='columns')
X_test

In [None]:
# Desired output: the 'fake' column of the training set
Y_train = instagram_df_train.loc[:, 'fake']
Y_train

In [None]:
# Desired output: the 'fake' column of the test set
Y_test = instagram_df_test.loc[:, 'fake']
Y_test

In [None]:
# Scale the features

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Learn each column's mean and standard deviation from the training features only.
scaler_x = StandardScaler().fit(X_train)

# Centre and reduce both splits (subtract the mean, divide by the standard
# deviation) using the statistics learned from the training set.
# Note: X_train and X_test are overwritten with their scaled versions, so
# re-running this cell would fit the scaler on already-scaled data.
X_train = scaler_x.transform(X_train)
X_test = scaler_x.transform(X_test)

In [None]:
# One-hot encode the labels of both splits into two-column class matrices,
# matching the two output units of the network.
y_train, y_test = (
    tf.keras.utils.to_categorical(labels, num_classes=2)
    for labels in (Y_train, Y_test)
)

y_train

In [None]:
# Shapes of the feature arrays and of the one-hot label arrays,
# in the order: X_train, X_test, y_train, y_test
tuple(array.shape for array in (X_train, X_test, y_train, y_test))

In [None]:
# Percentage of all rows (train + test) that belong to the training set
total_rows = len(X_test) + len(X_train)
Training_data = len(X_train) / total_rows * 100
Training_data

In [None]:
# Percentage of all rows (train + test) that belong to the test set
total_rows = len(X_test) + len(X_train)
Testing_data = len(X_test) / total_rows * 100
Testing_data

In [None]:
# Build and train the model

import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout masks and batch shuffling are repeatable between runs.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Take the input width from the data rather than hard-coding it, so the
# network still builds if a feature column is added or removed upstream.
n_features = X_train.shape[1]

model = Sequential()  # Feed-forward network built layer by layer
# Dense: every unit receives the outputs of all units of the previous layer.
model.add(Dense(50, input_dim=n_features, activation='relu'))
# relu: a unit outputs its input when that input is positive and 0 otherwise.
model.add(Dense(150, activation='relu'))
model.add(Dropout(0.3))  # randomly zero 30% of the activations while training
model.add(Dense(150, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(25, activation='relu'))
model.add(Dropout(0.3))
# Two output units, one per class; softmax turns their scores into
# probabilities that sum to 1.
model.add(Dense(2, activation='softmax'))

model.summary()

# adam: adaptive gradient-descent optimiser; categorical cross-entropy is the
# loss for one-hot encoded targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Keras carves the validation_split slice from the END of the arrays, before
# any shuffling. If the CSV rows happen to be ordered by label, that slice
# would contain a single class, so shuffle once (reproducibly) beforehand.
shuffle_idx = np.random.default_rng(SEED).permutation(len(X_train))

epochs_hist = model.fit(
    np.asarray(X_train)[shuffle_idx],
    np.asarray(y_train)[shuffle_idx],
    epochs=50,
    verbose=1,
    validation_split=0.1,
)  # trains the model and keeps the per-epoch history

In [None]:
# Model validation

# Show which metrics were recorded for every epoch.
print(epochs_hist.history.keys())

# Training loss against validation loss, one point per epoch.
plt.plot(epochs_hist.history['loss'])
plt.plot(epochs_hist.history['val_loss'])

plt.title('Model Loss Progression During Training/Validation')
plt.ylabel('Training and Validation Losses')
plt.xlabel('Epoch Number')
plt.legend(['Training Loss', 'Validation Loss'])
plt.show()  # original author's note: the curves show overfitting

# Class probabilities for every test row (one column per class).
predicted = model.predict(X_test)

# Turn probabilities and one-hot targets into class indices by taking the
# arg-max along the class axis; kept as lists, as in the original cell.
predicted_value = list(np.argmax(predicted, axis=1))
test = list(np.argmax(y_test, axis=1))

print(classification_report(test, predicted_value))

plt.figure(figsize=(10, 10))
cm = confusion_matrix(test, predicted_value)
# fmt='d' prints the integer counts as-is; the default format would render
# three-digit counts in scientific notation (e.g. 1.2e+02).
sns.heatmap(cm, annot=True, fmt='d')
# scikit-learn puts the true classes on the rows and the predictions on the columns.
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()