In [255]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [256]:
data = pd.read_csv('train.csv')
columns = ['Age', 'Gender', 'Pclass', 'Embarked', 'SibSp', 'Parch']
gender_dict = {
    'male': 1,
    'female': 2,
}
embarked_dict = {
    'C': 1,
    'Q': 2,
    'S': 3,
}
age_dict = {
    '0-2': 1, '3-13': 2, '14-24': 3, '25-50': 4, '51+': 5
}

data['Gender'] = data['Gender'].map(gender_dict)
data['Embarked'] = data['Embarked'].map(embarked_dict)
data['Survived'] = data['Survived'].apply(lambda x: float(x))
data['Age'] = data['Age'].apply(lambda x: float(x))
avg_age = data['Age'].mean()
data = data.fillna(avg_age)
data['Age'] = pd.cut(data['Age'], bins=[0, 2, 13, 24, 50, np.inf], labels=['0-2', '3-13', '14-24', '25-50', '51+'])
data['Age'] = data['Age'].map(age_dict)
print(data)

     Survived  Pclass                                               Name  \
0         0.0       3                            Braund, Mr. Owen Harris   
1         1.0       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2         1.0       3                             Heikkinen, Miss. Laina   
3         1.0       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4         0.0       3                           Allen, Mr. William Henry   
..        ...     ...                                                ...   
886       0.0       2                              Montvila, Rev. Juozas   
887       1.0       1                       Graham, Miss. Margaret Edith   
888       0.0       3           Johnston, Miss. Catherine Helen "Carrie"   
889       1.0       1                              Behr, Mr. Karl Howell   
890       0.0       3                                Dooley, Mr. Patrick   

     Gender Age  SibSp  Parch  Embarked  
0         1   3      1      0       3.0  
1  

In [257]:
def calculate_probability(factor)-> float:
    return data[factor].value_counts() / len(data)

def calculate_joint_probability(factor, target):
    grouped_data = data.groupby(factor)[target].value_counts()
    return grouped_data / grouped_data.sum()

def calculate_entropy(probability) -> float:
    return -np.sum(probability * np.log2(probability))

entropy_target = calculate_entropy(calculate_probability('Survived'))
print(entropy_target)
def calculate_mutual_information(factor):
    factor_prob = calculate_probability(factor)
    joint_prob = calculate_joint_probability(factor, 'Survived')
    
    entropy_factor = calculate_entropy(factor_prob)
    entropy_joint = calculate_entropy(joint_prob)
    return entropy_factor + entropy_target - entropy_joint

0.9607079018756469


In [258]:
def print_accuracy(mi_scores): 
    k = 4
    top_factors = [f for f, score in zip(columns, mi_scores) if score >= sorted(mi_scores, reverse=True)[k-1]]
    X = data[top_factors]

    X_train, X_test, y_train, y_test = train_test_split(X, data['Survived'], random_state=42)

    model = LogisticRegression()

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

In [259]:
def print_mi(mi_scores, key):
    results = pd.DataFrame({'Factor': columns, 'Mutual Information': mi_scores})
    results = results.sort_values('Mutual Information', ascending=False)
    results.to_csv(f'results-{key}.csv')
    print(results)

In [260]:
mi_scores = [calculate_mutual_information(factor) for factor in columns]
mi_scores
print_mi(mi_scores, 1)
print_accuracy(mi_scores)

     Factor  Mutual Information
1    Gender            0.217660
2    Pclass            0.083831
4     SibSp            0.033466
3  Embarked            0.024047
5     Parch            0.023611
0       Age            0.011279
Accuracy: 0.8071748878923767


In [261]:

mi_scores = mutual_info_classif(
    data[columns], data['Survived'], random_state=42)

print_mi(mi_scores, 2)
print_accuracy(mi_scores)

     Factor  Mutual Information
1    Gender            0.143608
2    Pclass            0.059195
3  Embarked            0.039275
5     Parch            0.033513
4     SibSp            0.023034
0       Age            0.000000
Accuracy: 0.7847533632286996
