In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif


In [94]:
data_frame = pd.read_csv('dataset.csv')
data_frame['gender'] = data_frame['gender'].replace({'M': 1, 'F': 0})
data_frame['gdp_country'] = data_frame['gdp_country'].str.replace('[$,]', '', regex=True).astype(float)


data_frame = data_frame.drop('city', axis=1)
data_frame = data_frame.drop('industries', axis=1)
data_frame = data_frame.drop('category', axis=1)
data_frame = data_frame.drop('personName', axis=1)
data_frame = data_frame.drop('source', axis=1)
data_frame = data_frame.drop('countryOfCitizenship', axis=1)
data_frame = data_frame.drop('organization', axis=1)
# data_frame = data_frame.drop('selfMade', axis=1)
data_frame = data_frame.drop('status', axis=1)
data_frame = data_frame.drop('lastName', axis=1)
data_frame = data_frame.drop('firstName', axis=1)
data_frame = data_frame.drop('title', axis=1)
data_frame = data_frame.drop('state', axis=1)
data_frame = data_frame.drop('residenceStateRegion', axis=1)
# data_frame = data_frame.drop('gender', axis=1)
data_frame = data_frame.drop('birthDate', axis=1)
data_frame = data_frame.drop('date', axis=1)
# data_frame = data_frame.drop('gdp_country', axis=1)
print(data_frame)

      rank  finalWorth   age        country  selfMade  gender  birthYear  \
0        1      211000  74.0         France     False       1     1949.0   
1        2      180000  51.0  United States      True       1     1971.0   
2        3      114000  59.0  United States      True       1     1964.0   
3        4      107000  78.0  United States      True       1     1944.0   
4        5      106000  92.0  United States      True       1     1930.0   
...    ...         ...   ...            ...       ...     ...        ...   
2635  2540        1000  51.0          China      True       1     1971.0   
2636  2540        1000  80.0  United States     False       1     1943.0   
2637  2540        1000  60.0          China      True       1     1962.0   
2638  2540        1000  71.0          China      True       1     1951.0   
2639  2540        1000  66.0    Philippines     False       1     1956.0   

      birthMonth  birthDay  cpi_country  cpi_change_country   gdp_country  \
0         

In [95]:
data_frame['class_label'] = data_frame['country'].apply(lambda x: 'US' if x == 'United States' else 'Non_US')


In [96]:
X = data_frame.drop(['country', 'class_label'], axis=1)
y = data_frame['class_label']

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train)

      rank  finalWorth   age  selfMade  gender  birthYear  birthMonth  \
2387  2259        1200  77.0     False       1     1945.0         5.0   
1187  1164        2600  40.0     False       1     1983.0         4.0   
2396  2259        1200  46.0      True       0     1977.0         1.0   
440    437        5900  66.0     False       1     1957.0         1.0   
508    497        5300  77.0      True       1     1945.0         6.0   
...    ...         ...   ...       ...     ...        ...         ...   
1638  1575        1900  67.0      True       1     1956.0         1.0   
1095  1067        2800  41.0      True       1     1982.0         1.0   
1130  1104        2700  37.0     False       1     1985.0         5.0   
1294  1272        2400  63.0     False       1     1959.0         9.0   
860    852        3400  84.0      True       1     1938.0         6.0   

      birthDay  cpi_country  cpi_change_country   gdp_country  \
2387       3.0          NaN                 NaN           

In [98]:
from sklearn.impute import SimpleImputer


imputer = SimpleImputer(strategy='mean') 

imputer.fit(X_train)

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)


In [99]:
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

In [100]:
def rank_features_odds(logistic_model, feature_names):
    feature_coef = pd.DataFrame({'Feature': feature_names, 'Coefficient': logistic_model.coef_[0]})
    feature_coef['Abs_Coefficient'] = feature_coef['Coefficient'].abs()
    feature_coef = feature_coef.sort_values(by='Abs_Coefficient', ascending=False)
    return feature_coef
def rank_features_mutual_info(X, y, feature_names):
    mi = SelectKBest(score_func=mutual_info_classif, k='all')
    mi.fit(X, y)
    feature_mi = pd.DataFrame({'Feature': feature_names, 'MI Score': mi.scores_})
    feature_mi = feature_mi.sort_values(by='MI Score', ascending=False)
    return feature_mi

In [101]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean') 
imputer.fit(X)
X_imputed = imputer.transform(X)

feature_names = X.columns
feature_ranking_odds = rank_features_odds(logistic_model, feature_names)

feature_ranking_mi = rank_features_mutual_info(X_imputed, y, feature_names)

print("Ranking of Features based on Odds function:")
print(feature_ranking_odds[['Feature', 'Coefficient']])

print("Ranking of Features based on Mutual Information:")
print(feature_ranking_mi)


Ranking of Features based on Odds function:
                                       Feature   Coefficient
16                          population_country -3.833206e-07
10                                 gdp_country  6.118955e-12
1                                   finalWorth -1.435855e-12
5                                    birthYear -1.016582e-12
0                                         rank -7.931755e-13
18                           longitude_country -7.386146e-14
8                                  cpi_country -7.237683e-14
12  gross_primary_education_enrollment_country -5.398138e-14
13                     life_expectancy_country -4.031806e-14
2                                          age -3.256677e-14
15                      total_tax_rate_country -2.747439e-14
11         gross_tertiary_education_enrollment -2.198877e-14
17                            latitude_country -1.694076e-14
14                 tax_revenue_country_country -8.346209e-15
7                                     bir

In [102]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 95.08%
