In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, accuracy_score

In [3]:
def read_data(path, sep):
    """Read a delimited text file into a DataFrame, discarding 'Unnamed' columns.

    Parameters
    ----------
    path : str or file-like
        Location of the file, forwarded to ``pd.read_csv``.
    sep : str
        Field delimiter, forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame without any column whose name starts with
        ``Unnamed`` (the name pandas gives to header-less columns such as a
        saved index).
    """
    frame = pd.read_csv(path, sep=sep)
    # Boolean mask over the column labels: True where the label starts with "Unnamed".
    is_unnamed = frame.columns.str.contains('^Unnamed')
    return frame.loc[:, ~is_unnamed]

In [4]:
# Load the "bag" and "cet" CSV files with the same reader settings.
bag, cet = (
    read_data(csv_path, sep=",")
    for csv_path in ('dados/sub_rock_bag.csv', 'dados/sub_rock_cet.csv')
)

In [8]:
# Preview the "cet" dataset (pandas elides the middle rows in the rendered table).
cet

Unnamed: 0,Characters,Words,AvgWordLen,UniqueWords,Sentences,AvgWordsSentence,Syllables,AvgSyllableWords,RareWordsRatio,LexicalDiversity,label
0,997,198,4.186869,43,44,4.500000,225,1.136364,0.000000,0.217172,Punk Rock
1,1634,351,3.652422,145,59,5.949153,368,1.048433,0.227920,0.413105,Heavy Metal
2,643,135,3.829630,79,17,7.941176,157,1.162963,0.392593,0.585185,Soft Rock
3,592,133,3.593985,62,22,6.045455,145,1.090226,0.195489,0.466165,Heavy Metal
4,1255,276,3.710145,99,45,6.133333,298,1.079710,0.152174,0.358696,Punk Rock
...,...,...,...,...,...,...,...,...,...,...,...
2600,947,227,3.348018,93,40,5.675000,230,1.013216,0.251101,0.409692,Heavy Metal
2601,1325,257,4.245136,120,40,6.425000,315,1.225681,0.268482,0.466926,Punk Rock
2602,850,182,3.879121,71,38,4.789474,205,1.126374,0.164835,0.390110,Soft Rock
2603,780,171,3.690058,92,22,7.772727,186,1.087719,0.368421,0.538012,Heavy Metal


In [5]:
# Work on a copy: amputacao() writes NaNs into the frame it receives, and the
# untouched `cet` is needed later as the ground truth for the imputation error.
df = cet.copy()

In [6]:
def amputacao(df, taxa, seed=None):
    """Randomly blank out feature cells until a target missing-value rate is reached.

    Cells are drawn uniformly at random from every column except the last one
    (treated as the label column) and overwritten with NaN. Drawing continues
    while the number of NaNs in the frame divided by the number of feature
    cells is below ``taxa``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to amputate. It is modified IN PLACE; pass a copy if the
        original values must be kept.
    taxa : float
        Target fraction of missing feature cells. Values above 1 can never be
        reached and are rejected.
    seed : int or None, optional
        When given, a private ``np.random.RandomState(seed)`` is used so the
        result is reproducible. When ``None`` (default) the global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now containing the injected NaNs.

    Raises
    ------
    ValueError
        If ``taxa`` is greater than 1 (the loop would never terminate).
    """
    n_rows = len(df)
    n_features = len(df.columns) - 1  # the last column is never amputated
    total_cells = n_rows * n_features

    # Nothing that can be amputated (no rows or no feature columns).
    if total_cells <= 0:
        return df
    if taxa > 1:
        raise ValueError("taxa must not exceed 1, got %r" % (taxa,))

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Count the existing NaNs once and keep the tally up to date incrementally,
    # instead of rescanning the whole frame on every iteration.
    missing = int(df.isna().sum().sum())
    while missing / total_cells < taxa:
        row = rng.randint(n_rows)
        col = rng.randint(n_features)
        # A cell that is already NaN does not change the tally.
        if not pd.isna(df.iat[row, col]):
            df.iloc[row, col] = np.nan
            missing += 1
    return df

In [7]:
# Seed NumPy's global RNG so the randomly amputated cells -- and every number
# derived from them further down -- are the same on each Restart & Run All.
np.random.seed(42)
# Blank out 20% of the feature cells; df_amp is the same object as df.
df_amp = amputacao(df, 0.2)

In [8]:
# Number of missing values per column after the amputation.
df_amp.isna().sum()

Characters          522
Words               545
AvgWordLen          520
UniqueWords         524
Sentences           528
AvgWordsSentence    534
Syllables           480
AvgSyllableWords    542
RareWordsRatio      508
LexicalDiversity    507
label                 0
dtype: int64

Abordagens de imputação

SimpleImputer

In [23]:
from sklearn.impute import SimpleImputer

# Split the amputated frame into the feature matrix (every column but the
# last) and the label vector.
df = df_amp
X, y = df.iloc[:, :-1], df['label'].values

# Fill each missing value with the mean of its column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
transformed_values = imputer.fit_transform(X)

# Rebuild a labelled DataFrame from the imputed array.
df_imp = pd.DataFrame(transformed_values, columns=X.columns).assign(label=y)

KNNImputer

In [57]:
from sklearn.impute import KNNImputer

# Feature matrix (every column but the last) and label vector of the
# amputated frame.
X, y = df.iloc[:, :-1], df['label'].values

# Fill each missing value from the 1000 nearest rows.
imputer = KNNImputer(n_neighbors=1000)
df_imp2 = pd.DataFrame(data=imputer.fit_transform(X), columns=X.columns)

In [11]:
# Re-attach the labels, which were left out of the matrix passed to the imputer.
df_imp2['label'] = y

Obtendo a localização dos elementos NaN

In [12]:
# Collect the (row, column) position of every NaN in the amputated frame.
# The NaN mask is computed once; np.argwhere returns the True positions in
# row-major order, i.e. the same order a nested row/column loop would visit
# them. .tolist() converts the indices to plain Python ints.
list_loc = [tuple(pos) for pos in np.argwhere(df.isna().to_numpy()).tolist()]

Obtendo os valores do dataframe original e dos dataframes imputados nas posições dos NaN

In [13]:
def get_values(list_loc, data_frame):
    """Return the cells of ``data_frame`` found at the given positions.

    Parameters
    ----------
    list_loc : iterable of (int, int)
        Positional (row, column) pairs, as accepted by ``DataFrame.iloc``.
    data_frame : pandas.DataFrame
        Frame to read from.

    Returns
    -------
    list
        One value per entry of ``list_loc``, in the same order.
    """
    return [data_frame.iloc[pos[0], pos[1]] for pos in list_loc]

In [58]:
# Read the same NaN positions from three frames that share one column order:
# the original data (ground truth), the SimpleImputer result and the
# KNNImputer result.
list_val, list_imp1, list_imp2 = (
    get_values(list_loc, frame) for frame in (cet, df_imp, df_imp2)
)

In [15]:
from sklearn.metrics import mean_squared_error as mse

In [25]:
# Mean squared error of the SimpleImputer estimates against the original
# values at the amputated positions (all feature columns pooled together).
print(mse(list_val, list_imp1))

138325.0855212449


In [59]:
# Mean squared error of the KNNImputer estimates against the original values
# at the amputated positions (all feature columns pooled together).
print(mse(list_val, list_imp2))

7605.528797662619


In [26]:
# Summary statistics of the numeric columns of the SimpleImputer-filled frame.
df_imp.describe()

Unnamed: 0,Characters,Words,AvgWordLen,UniqueWords,Sentences,AvgWordsSentence,Syllables,AvgSyllableWords,RareWordsRatio,LexicalDiversity
count,2605.0,2605.0,2605.0,2605.0,2605.0,2605.0,2605.0,2605.0,2605.0,2605.0
mean,834.971209,172.016123,3.149025,75.918618,29.314012,5.243879,204.319002,0.914035,0.228728,0.369543
std,573.142657,118.406754,1.602301,50.803727,21.153135,8.031084,133.712866,0.474266,0.167504,0.21583
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,548.0,114.0,3.47619,52.0,18.0,4.210526,139.0,1.05042,0.089888,0.268097
50%,852.0,176.0,3.808383,81.0,29.0,5.590909,207.0,1.118182,0.227488,0.410029
75%,1170.0,242.0,4.068063,103.0,41.0,6.791667,283.0,1.176471,0.349206,0.525253
max,6010.0,1013.0,5.894444,411.0,172.0,289.0,1546.0,1.648438,0.717949,0.815789
