In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# Load the input table (comma-separated, pandas' default delimiter).
# NOTE(review): 'row.csv' may be a typo for 'raw.csv' — confirm against the data directory.
data = pd.read_csv('data/row.csv')

In [3]:
# Replace the values of the `label` column with integer class codes.
# The fitted encoder stays available as `le` for mapping codes back later.
le = LabelEncoder().fit(data['label'])
data['label'] = le.transform(data['label'])

### Remove highly correlated features

In [4]:
# Drop one feature from every pair of features whose absolute correlation
# exceeds `threshold`.
#
# The target column is deliberately left out of the pairwise search: otherwise
# a feature that correlates strongly with the label could cause the label
# itself (or that highly predictive feature) to be removed.
target_col = 'label'
threshold = 0.9  # Adjust this threshold as needed to remove highly correlated features

correlation_matrix = data.corr()
feature_cols = [col for col in correlation_matrix.columns if col != target_col]

# Visit each unordered pair once, in column order (upper triangle only).
# The mirrored (col2, col1) pairs can never trigger a drop under the greedy
# rule below, so skipping them does not change which columns are removed.
# NaN correlations compare False against the threshold and are ignored.
high_corr_pairs = [
    (col1, col2, correlation_matrix.loc[col1, col2])
    for position, col1 in enumerate(feature_cols)
    for col2 in feature_cols[position + 1:]
    if abs(correlation_matrix.loc[col1, col2]) > threshold
]

# Greedy selection: keep the earlier column of a pair and drop the later one,
# skipping any pair in which either member has already been dropped.
cols_to_drop = set()
for col1, col2, corr_value in high_corr_pairs:
    if col1 not in cols_to_drop and col2 not in cols_to_drop:
        cols_to_drop.add(col2)
        print(f"Removing feature '{col2}' with correlation value {corr_value} with feature '{col1}'")

# Drop the highly correlated features
data = data.drop(columns=list(cols_to_drop))

Removing feature 'median' with correlation value 0.9254453730463191 with feature 'meanfreq'
Removing feature 'Q25' with correlation value 0.9114163463244435 with feature 'meanfreq'
Removing feature 'centroid' with correlation value 1.0 with feature 'meanfreq'
Removing feature 'kurt' with correlation value 0.9770204562201018 with feature 'skew'
Removing feature 'dfrange' with correlation value 0.9998384146229784 with feature 'maxdom'


### Remove outliers

In [5]:
# Replace out-of-range values with the column median.
#
# Each rule maps a column to a predicate that marks the values to KEEP; every
# other value (including NaN, which compares False) is replaced by that
# column's median. The cut-offs are the hand-picked thresholds used so far.
outlier_rules = {
    'skew': lambda values: values < 6,
    'mindom': lambda values: values <= 0.075,
    'minfun': lambda values: values <= 0.075,
    'maxfun': lambda values: values >= 0.2,
}
columns_with_outliers = list(outlier_rules)

# Medians are taken from the data before any replacement is applied.
outlier_medians = {col: data[col].median() for col in columns_with_outliers}

for col, keep in outlier_rules.items():
    # Series.where keeps entries where the mask is True and substitutes the
    # median everywhere else — the vectorized form of an element-wise lambda.
    data[col] = data[col].where(keep(data[col]), outlier_medians[col])

### Feature Scaling

In [6]:
# Separate the target from the features, then min-max scale the features.
y = data['label']
X = data.drop(columns=['label'])

# fit_transform returns the scaled features as a new object bound to `X`;
# `data` itself is left unscaled.
# NOTE(review): the scaler is fitted on every row, before any train/test
# split — confirm this is intended.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [7]:
# Persist the cleaned table without the index column.
# NOTE(review): this writes `data`, which still holds the unscaled feature
# values; the MinMax-scaled array lives only in `X` — confirm that is intended.
data.to_csv(path_or_buf='data/voice.csv', index=False)