In [17]:
import librosa
import mysql.connector
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import joblib
import time
import os
from dotenv import load_dotenv

# Connect to the database; credentials are read from the environment (.env file).
load_dotenv()
db = mysql.connector.connect(user=os.getenv("db_user"), password=os.getenv("db_password"), host='localhost', database=os.getenv("db_name"))

# Fetch all recording rows up front, then release the cursor and the connection
# immediately. Closing in `finally` guarantees nothing is leaked when the query
# fails, and the connection is no longer held open during audio processing.
try:
    cursor = db.cursor()
    try:
        cursor.execute("select * from kayitlar")
        results = cursor.fetchall()
    finally:
        cursor.close()
finally:
    db.close()

labels = []
features = []
segment_duration = 20  # length of one training segment, in seconds

# Split every recording into fixed-length segments and describe each segment by
# the per-coefficient mean of its MFCCs.
# NOTE(review): rows are read positionally -- column 0 is used as the label and
# column 1 as the audio file path; confirm against the `kayitlar` table schema.
for sound_file in results:
    audio, sample_rate = librosa.load(sound_file[1])
    segment_samples = segment_duration * sample_rate
    # Integer division: trailing audio shorter than a full segment is discarded,
    # so a recording shorter than `segment_duration` contributes no rows at all.
    num_segments = len(audio) // segment_samples
    for i in range(num_segments):
        labels.append(sound_file[0])
        start = i * segment_samples
        end = start + segment_samples
        segment = audio[start:end]
        mfcc = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=30)
        # Average over the frame axis -> one value per MFCC coefficient.
        mfcc_mean = np.mean(mfcc.T, axis=0)
        features.append(mfcc_mean)

# Fail early with a clear message when nothing was extracted. Otherwise the label
# encoder file would be overwritten with an empty encoder before the scaler
# rejects the empty input with a much less helpful error.
if len(features) == 0:
    raise ValueError("No feature vectors were extracted; nothing to encode, scale or save.")

# Convert the labels to integer class ids.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Standardize the features.
# NOTE(review): the scaler is fitted on the complete dataset, so any train/test
# split made later on the saved CSV evaluates on data the scaler has already
# seen -- consider fitting the scaler on the training portion only.
scaler = StandardScaler()
features_normalized = scaler.fit_transform(features)

# Persist the fitted transformers only after both steps succeeded, so the two
# files on disk always belong to the same run.
joblib.dump(label_encoder, "label_encoder.pkl")
joblib.dump(scaler, "scaler.pkl")

# Combine the normalized features and the encoded labels into one DataFrame
# and store it as CSV (feature columns first, `label` as the last column).
df = pd.DataFrame(features_normalized)
df['label'] = encoded_labels

df.to_csv("ses.csv", index=False)

# Rank the features by importance to judge whether feature selection (dropping
# low-value columns) would be worthwhile. This block only reports the ranking;
# it does not remove any column.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier  

df = pd.read_csv('ses.csv')

# Every column except the target is a feature. Deriving the list from the frame
# keeps this in sync with however many columns the CSV actually contains.
x = df.drop(columns="label")
y = df["label"]

# A fixed seed makes the importance ranking reproducible between runs.
model = RandomForestClassifier(random_state=42)
model.fit(x, y)

# Feature importance ranking, most important first.
feature_importances = pd.DataFrame({
    "Feature": x.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

print(feature_importances)

   Feature  Importance
13      13    0.060901
0        0    0.048938
10      10    0.046661
5        5    0.043935
16      16    0.041746
19      19    0.041655
18      18    0.041370
3        3    0.041154
4        4    0.041112
2        2    0.040747
7        7    0.038787
8        8    0.038257
6        6    0.037440
12      12    0.036893
15      15    0.036501
25      25    0.036093
21      21    0.034688
26      26    0.033686
1        1    0.032918
20      20    0.032571
28      28    0.030853
14      14    0.024418
9        9    0.022576
11      11    0.021838
23      23    0.021014
22      22    0.017971
27      27    0.014974
17      17    0.013873
24      24    0.013820
29      29    0.012608


In [18]:
#Feature selection yapılabilirdi ancak daha net sonuçlar istediğimiz için yapmama kararı aldık.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import joblib 
from sklearn.model_selection import train_test_split

# Candidate classifiers. The stochastic ones get a fixed seed (the same value
# used for the train/test split below) so the comparison is reproducible.
models = [
    ("KNeighborsClassifier", KNeighborsClassifier()),
    ("SVC", SVC()),
    ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=42)),
    ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
    ("GaussianNB", GaussianNB()),
    ("LogisticRegression", LogisticRegression(max_iter=1000)),
    ("MLPClassifier", MLPClassifier(max_iter=1000, random_state=42)),
]

# Every column except the target is a feature.
x = df.drop(columns="label")
y = df["label"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Running best. These MUST be updated whenever a model is saved; otherwise every
# model compares against the initial values and the last one always wins.
best_accuracy = 0
best_time = float("inf")

# One entry per evaluated model, accumulated across the whole loop.
results = []

# Select the best model: highest test accuracy, ties broken by the shorter
# fit + predict time.
for name, model in models:
    start_time = time.perf_counter()

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Stop the clock before printing and scoring so only fit + predict is timed.
    elapsed_s = time.perf_counter() - start_time

    print(y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    entry = {
        "Algoritma": name,
        'Doğruluk': accuracy,
        'Zaman (s)': elapsed_s
    }
    results.append(entry)

    is_better = accuracy > best_accuracy or (accuracy == best_accuracy and elapsed_s < best_time)
    if is_better:
        best_accuracy = accuracy
        best_time = elapsed_s
        joblib.dump(model, "best_model.joblib")

    print(entry)

[10 12  0 12  1 13  3 12  5 12  6  2  4  3  4  4]
[{'Algoritma': 'KNeighborsClassifier', 'Doğruluk': 0.625, 'Zaman (s)': 0.020940065383911133}]
[10 11  0 12  1 13  3  2  5 12  6  2 13  3  4  7]
[{'Algoritma': 'SVC', 'Doğruluk': 0.875, 'Zaman (s)': 0.006727457046508789}]
[10 11  7 12  1  8  3  2  5  1  6 11  8  3 12  9]
[{'Algoritma': 'DecisionTreeClassifier', 'Doğruluk': 0.5, 'Zaman (s)': 0.0028345584869384766}]
[10 11  0 11  1  4  3  2  5 12  6  2 13  3 12  7]
[{'Algoritma': 'RandomForestClassifier', 'Doğruluk': 0.875, 'Zaman (s)': 0.17548465728759766}]
[10  1  0  1  1  9  3  1  5 12  6  8  9  3  1  1]
[{'Algoritma': 'GaussianNB', 'Doğruluk': 0.5, 'Zaman (s)': 0.008117914199829102}]
[10 11  0 11  1 13  3  2  5 12  6  2 13  3 12  7]
[{'Algoritma': 'LogisticRegression', 'Doğruluk': 0.9375, 'Zaman (s)': 0.01640915870666504}]
[10 11  0 11  1 13  3  2  5 12  6  2 13  3 12  7]
[{'Algoritma': 'MLPClassifier', 'Doğruluk': 0.9375, 'Zaman (s)': 0.18759822845458984}]
