In [1]:
from pandas import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('./dataset/names.csv')
df = df.drop_duplicates(subset="Name")
df.head()

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065.0
1,2,Anna,1880,F,2604.0
2,3,Emma,1880,F,2003.0
3,4,Elizabeth,1880,F,1939.0
4,5,Minnie,1880,F,1746.0


In [3]:
def checkVowelEnd(name):
    if name[-1] in "aeiou":
        return "Vowel End"
    return "Consonant End"

In [4]:
df["Vowel/Consonant End"] = df["Name"].apply(checkVowelEnd)
df.head()

Unnamed: 0,Id,Name,Year,Gender,Count,Vowel/Consonant End
0,1,Mary,1880,F,7065.0,Consonant End
1,2,Anna,1880,F,2604.0,Vowel End
2,3,Emma,1880,F,2003.0,Vowel End
3,4,Elizabeth,1880,F,1939.0,Consonant End
4,5,Minnie,1880,F,1746.0,Vowel End


In [5]:
def checkGender(gender):
    if gender == "F":
        return 0
    return 1
    
df["Gender Value"] = df["Gender"].apply(checkGender)

df.head()

Unnamed: 0,Id,Name,Year,Gender,Count,Vowel/Consonant End,Gender Value
0,1,Mary,1880,F,7065.0,Consonant End,0
1,2,Anna,1880,F,2604.0,Vowel End,0
2,3,Emma,1880,F,2003.0,Vowel End,0
3,4,Elizabeth,1880,F,1939.0,Consonant End,0
4,5,Minnie,1880,F,1746.0,Vowel End,0


In [6]:
def compare(group):
    return df.groupby([group])["Gender Value"].sum()*100/df.groupby([group])["Gender Value"].count()
df.groupby(["Vowel/Consonant End"])["Gender Value"].sum()
df.groupby(["Vowel/Consonant End"])['Gender Value'].count()


Vowel/Consonant End
Consonant End     9341
Vowel End        11914
Name: Gender Value, dtype: int64

In [7]:
female_names = sum(df.groupby(["Vowel/Consonant End"])["Gender Value"].sum())
all_names = df.groupby(["Gender"])["Gender Value"].count()
print (all_names)

Gender
F    12734
M     8521
Name: Gender Value, dtype: int64


In [8]:
def vowelConsonantStart(name):
    if name[0] in "aeiou":
        return "Vowel Start"
    return "Consonant Start"

df["Vowel/Consonant Start"] = df["Name"].apply(vowelConsonantStart)

print("\n Comparison => %s", compare("Vowel/Consonant Start"))

df.head()


 Comparison => %s Vowel/Consonant Start
Consonant Start    40.089391
Name: Gender Value, dtype: float64


Unnamed: 0,Id,Name,Year,Gender,Count,Vowel/Consonant End,Gender Value,Vowel/Consonant Start
0,1,Mary,1880,F,7065.0,Consonant End,0,Consonant Start
1,2,Anna,1880,F,2604.0,Vowel End,0,Consonant Start
2,3,Emma,1880,F,2003.0,Vowel End,0,Consonant Start
3,4,Elizabeth,1880,F,1939.0,Consonant End,0,Consonant Start
4,5,Minnie,1880,F,1746.0,Vowel End,0,Consonant Start


In [9]:
def shortLongName(name):
    if len(name) < 7:
        return "Short"
    return "Long"

df["Short/Long Name"] = df["Name"].apply(shortLongName)
print(compare("Short/Long Name"))
df.head()

Short/Long Name
Long     32.909653
Short    45.096630
Name: Gender Value, dtype: float64


Unnamed: 0,Id,Name,Year,Gender,Count,Vowel/Consonant End,Gender Value,Vowel/Consonant Start,Short/Long Name
0,1,Mary,1880,F,7065.0,Consonant End,0,Consonant Start,Short
1,2,Anna,1880,F,2604.0,Vowel End,0,Consonant Start,Short
2,3,Emma,1880,F,2003.0,Vowel End,0,Consonant Start,Short
3,4,Elizabeth,1880,F,1939.0,Consonant End,0,Consonant Start,Long
4,5,Minnie,1880,F,1746.0,Vowel End,0,Consonant Start,Short


In [10]:
training_data = df[["Gender Value", "Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start"]]
training_data.head()

Unnamed: 0,Gender Value,Vowel/Consonant End,Short/Long Name,Vowel/Consonant Start
0,0,Consonant End,Short,Consonant Start
1,0,Vowel End,Short,Consonant Start
2,0,Vowel End,Short,Consonant Start
3,0,Consonant End,Long,Consonant Start
4,0,Vowel End,Short,Consonant Start


In [11]:
def reprCategory(column):
    column = column.astype("category")
    return column.cat.codes
    
training_data[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start"]] = training_data[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start"]].apply(reprCategory)

training_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Gender Value,Vowel/Consonant End,Short/Long Name,Vowel/Consonant Start
0,0,0,1,0
1,0,1,1,0
2,0,1,1,0
3,0,0,0,0
4,0,1,1,0


In [12]:
train, test = train_test_split(training_data, test_size = 0.20)

In [13]:
clf = DecisionTreeClassifier()
clf = clf.fit(train[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start"]], train["Gender Value"])

In [14]:
clf
clf.feature_importances_


array([0.96469653, 0.03530347, 0.        ])

In [15]:
predictions = clf.predict(test[["Vowel/Consonant End", "Short/Long Name", "Vowel/Consonant Start"]])
accuracy_score(test["Gender Value"], predictions)

0.7539402493530933