In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import defaultdict
from math import log

In [2]:
df = pd.read_csv('names.csv')
df.head()

Unnamed: 0,year,name,percent,sex
0,1880,John,0.081541,boy
1,1880,William,0.080511,boy
2,1880,James,0.050057,boy
3,1880,Charles,0.045167,boy
4,1880,George,0.043292,boy


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258000 entries, 0 to 257999
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   year     258000 non-null  int64  
 1   name     258000 non-null  object 
 2   percent  258000 non-null  float64
 3   sex      258000 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 7.9+ MB


In [4]:
x_train, x_test = train_test_split(df, test_size = 0.3, random_state = 42)

In [12]:
def train(samples):
    classes, freq = defaultdict(lambda:0), defaultdict(lambda:0)
    for feats, label in samples:
        classes[label] += 1                 # count classes frequencies
        for feat in feats:
            freq[label, feat] += 1          # count features frequencies

    for label, feat in freq:                # normalize features frequencies
        freq[label, feat] /= classes[label]
    for c in classes:                       # normalize classes frequencies
        classes[c] /= len(samples)
    print(classes, freq)
    return classes, freq                    # return P(C) and P(O|C)

def classify(classifier, feats):
    classes, prob = classifier
    return min(classes.keys(),              # calculate argmin(-log(P(C|O))) -> argmax(P(C|O))
        key = lambda cl: -log(classes[cl]) + \
            sum(-log(prob.get((cl,feat), 1)) for feat in feats))

def get_features(sample): return (sample[-1]) # get last letter

In [13]:
train_features = [(get_features(row), row['sex']) for index, row in x_train.iterrows()]
classifier = train(train_features)

defaultdict(<function train.<locals>.<lambda> at 0x0000022AA9476310>, {'boy': 0.5001771871539313, 'girl': 0.49982281284606866}) defaultdict(<function train.<locals>.<lambda> at 0x0000022ACDC98D30>, {('boy', 'b'): 1.0, ('boy', 'o'): 1.0, ('boy', 'y'): 1.0, ('girl', 'g'): 1.0, ('girl', 'i'): 1.0, ('girl', 'r'): 1.0, ('girl', 'l'): 1.0})


In [14]:
test_features = [(get_features(row), row['sex']) for index, row in x_test.iterrows()]
predictions = [classify(classifier, [feat]) for feat, _ in test_features]

In [15]:
correct = sum(1 for pred, (_, gender) in zip(predictions, test_features) if pred == gender)
accuracy = correct / len(test_features)
print("Доля правильных ответов классификатора:", accuracy)

Доля правильных ответов классификатора: 0.4995865633074935


In [27]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score
from functools import reduce

In [28]:
def get_features(sample):
    return str(sample[0]) + str(sample[-1]) 


def classify(classifier, feats):
    classes, prob = classifier
    probabilities = {cl: classes[cl] * reduce(lambda x, y: x * y, [prob.get((cl, feat), 1) for feat in feats]) for cl in classes.keys()}
    return max(probabilities.keys(), key=lambda cl: probabilities[cl])

In [29]:
train_features2 = [(get_features(row), row['sex']) for index, row in x_train.iterrows()]
classifier2 = train(train_features2)

defaultdict(<function train.<locals>.<lambda> at 0x0000022ACDC94AF0>, {'boy': 0.5001771871539313, 'girl': 0.49982281284606866}) defaultdict(<function train.<locals>.<lambda> at 0x0000022AD3BFAEE0>, {('boy', '2'): 0.2468339016074038, ('boy', '0'): 0.3155803037683213, ('boy', 'b'): 1.0, ('boy', 'o'): 1.0, ('boy', 'y'): 1.0, ('girl', '1'): 1.1090530420525546, ('girl', '9'): 1.0251916515265653, ('girl', '2'): 0.24738556299020695, ('girl', '3'): 0.17814729472238225, ('girl', 'g'): 1.0, ('girl', 'i'): 1.0, ('girl', 'r'): 1.0, ('girl', 'l'): 1.0, ('boy', '1'): 1.1069056369835717, ('boy', '9'): 1.0234911216401719, ('boy', '3'): 0.1789952619226852, ('boy', '5'): 0.179183456582385, ('girl', '7'): 0.1785682633934506, ('boy', '8'): 0.4126223265288049, ('boy', '6'): 0.17948235398308462, ('girl', '8'): 0.41190676651748126, ('girl', '0'): 0.3167013781184916, ('boy', '4'): 0.17826462383208608, ('girl', '4'): 0.17808082598484512, ('girl', '6'): 0.17722781051978553, ('girl', '5'): 0.17773740417423672, (

In [30]:
test_features2 = [(get_features(row), row['sex']) for index, row in x_test.iterrows()]
predictions2 = [classify(classifier2, [feat]) for feat, _ in test_features2]

In [34]:
correct2 = sum(1 for pred, (_, gender) in zip(predictions2, test_features2) if pred == gender)
accuracy2 = correct2 / len(test_features2)
print("Доля правильных ответов классификатора:", accuracy2)

Доля правильных ответов классификатора: 0.4995865633074935


In [37]:
gnb = GaussianNB()
gnb.fit(x_train['name'].str[-1].apply(ord).values.reshape(-1, 1), x_train['sex'])

mnb = MultinomialNB()
mnb.fit(x_train['name'].str[-1].apply(ord).values.reshape(-1, 1), x_train['sex'])

gnb_predictions = gnb.predict(x_test['name'].str[-1].apply(ord).values.reshape(-1, 1))
mnb_predictions = mnb.predict(x_test['name'].str[-1].apply(ord).values.reshape(-1, 1))

gnb_accuracy = accuracy_score(x_test['sex'], gnb_predictions)
mnb_accuracy = accuracy_score(x_test['sex'], mnb_predictions)

print("Точность гауссовского наивного байесовского классификатора:", gnb_accuracy)
print("Точность мультиномиального наивного байесовского классификатора:", mnb_accuracy)

Точность гауссовского наивного байесовского классификатора: 0.7358785529715762
Точность мультиномиального наивного байесовского классификатора: 0.4995865633074935


# Самым точным стал гаусовский наивный классификатор, в то время как остальные были правы лишь в половине случаев

In [43]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [39]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

y_pred = lda.predict(X_test)

In [40]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Confusion Matrix:
[[19  0  0]
 [ 0 13  0]
 [ 0  0 13]]


In [41]:
def LDA_dimensionality(X, y, k):
    '''
    X - набор данных, y - метка, k - целевой размер
    '''
    label_ = list(set(y))

    X_classify = {}

    for label in label_:
        X1 = np.array([X[i] for i in range(len(X)) if y[i] == label])
        X_classify[label] = X1

    mju = np.mean(X, axis=0)
    mju_classify = {}

    for label in label_:
        mju1 = np.mean(X_classify[label], axis=0)
        mju_classify[label] = mju1

    #St = np.dot((X - mju).T, X - mju)

    Sw = np.zeros((len(mju), len(mju)))  # Вычислить матрицу внутриклассовой дивергенции
    for i in label_:
        Sw += np.dot((X_classify[i] - mju_classify[i]).T,
                     X_classify[i] - mju_classify[i])

    # Sb=St-Sw

    Sb = np.zeros((len(mju), len(mju)))  # Вычислить матрицу внутриклассовой дивергенции
    for i in label_:
        Sb += len(X_classify[i]) * np.dot((mju_classify[i] - mju).reshape(
            (len(mju), 1)), (mju_classify[i] - mju).reshape((1, len(mju))))

    eig_vals, eig_vecs = np.linalg.eig(
        np.linalg.inv(Sw).dot(Sb))  # Вычислить собственное значение и собственную матрицу Sw-1 * Sb

    sorted_indices = np.argsort(eig_vals)
    topk_eig_vecs = eig_vecs[:, sorted_indices[:-k - 1:-1]]  # Извлекаем первые k векторов признаков
    return topk_eig_vecs

In [46]:
X_lda = LDA_dimensionality(X_train, y_train, 2)
X_lda

array([[ 0.20115381+0.j, -0.01094741+0.j],
       [ 0.50012931+0.j, -0.72669419+0.j],
       [-0.58704893+0.j,  0.19626006+0.j],
       [-0.60397134+0.j, -0.65823833+0.j]])

In [47]:
from sklearn.datasets import load_wine
from sklearn.model_selection import KFold, LeaveOneOut, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
import numpy as np

wine_data = load_wine()
X = wine_data.data
y = wine_data.target

In [48]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [49]:
loo = LeaveOneOut()

In [50]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [52]:
results = []
for k in range(1, 51):
    knn = KNeighborsClassifier(n_neighbors=k)
    
    X_scaled = scale(X)

    for cv_method, name in zip([kf, loo, skf], ['KFold', 'Leave-One-Out', 'Stratified KFold']):
        scores = cross_val_score(knn, X_scaled, y, cv=cv_method, scoring='accuracy')
        results.append((name, k, scores.mean()))

# Находим лучший результат
best_result = max(results, key=lambda x: x[2])
print("Лучший результат у метода валидации: {}, k: {}, Точность: {:.2f}".format(best_result[0], best_result[1], best_result[2]))

Лучший результат у метода валидации: Leave-One-Out, k: 36, Точность: 0.98
