In [None]:
import os

import pandas as pd
from sklearn.utils import shuffle

class DataReader():
  """Load a tabular file into a DataFrame and offer small preprocessing helpers."""

  # Maps a lower-cased file extension to the pandas reader used for it.
  _LOADERS = {
    'csv': pd.read_csv,
    'tsv': lambda path: pd.read_csv(path, sep='\t'),
    'json': pd.read_json,
  }

  def __init__(self, file_path: str):
    """
    file_path = path of the file to load; its extension selects the reader
    """
    # Path the data was loaded from.
    self.file_path = file_path
    # Current working DataFrame; the preprocessing helpers rebind this attribute.
    self.df = self.load_data()

  def get_file_extension(self) -> str:
    """
    return the extension of self.file_path without the leading dot
    ('' when the file name has no extension); letter case is preserved
    """
    # splitext only inspects the final path component, so dots in directory
    # names are not mistaken for an extension.
    _, extension = os.path.splitext(self.file_path)
    return extension[1:]

  def load_data(self) -> pd.DataFrame:
    """
    read self.file_path with the pandas reader matching its extension
    (matched case-insensitively)

    raises ValueError when the extension is not supported
    """
    extension = self.get_file_extension().lower()
    loader = self._LOADERS.get(extension)
    if loader is None:
      supported = ', '.join(sorted(self._LOADERS))
      raise ValueError(
        f"Unsupported file extension '{extension}' for '{self.file_path}'; "
        f"supported extensions: {supported}"
      )

    return loader(self.file_path)

  def train_test_split(self, percent_train: int, shuffle_data=True, random_state=None):
    """
    split self.df row-wise into a train part and a test part

    percent_train = percentage (0-100) of rows that go into the train part
    shuffle_data = shuffle the rows before splitting
    random_state = optional seed forwarded to DataFrame.sample for a reproducible shuffle

    self.df itself is left untouched, so repeated calls with the same seed
    give the same split.

    return (train, test)
    raises ValueError when percent_train is outside 0-100
    """
    if not 0 <= percent_train <= 100:
      raise ValueError(f"percent_train must be between 0 and 100, got {percent_train}")

    rows = self.df
    if shuffle_data:
      # frac=1 returns every row in random order, keeping the original index labels.
      rows = rows.sample(frac=1, random_state=random_state)

    # Multiply before dividing: len * (percent / 100) can land just below the
    # exact value (100 * 0.29 -> 28.999...) and truncate one row short.
    n_train = int(len(rows) * percent_train / 100)

    return rows.iloc[:n_train], rows.iloc[n_train:]

  def drop_nan(self, column_name: str) -> None:
    """
    remove from self.df every row whose column_name value is missing
    """
    self.df = self.df.dropna(subset=[column_name])

  def categorical_to_numerical(self, column_name: str) -> dict:
    """
    drops all rows with a NaN value in column_name
    converts self.df[column_name] to integer codes, numbered in order of first appearance

    return dict({categorical_data: numerical_code})
    """
    self.drop_nan(column_name)

    categorical_dict = {
      category: code
      for code, category in enumerate(self.df[column_name].unique())
    }

    # Series.map gives an integer column directly; DataFrame.replace on an object
    # column relies on version-dependent downcasting.
    encoded = self.df.copy()
    encoded[column_name] = encoded[column_name].map(categorical_dict)
    self.df = encoded

    return categorical_dict

  def value_counts(self, column_name) -> pd.Series:
    """
    return how many rows of self.df hold each distinct value of column_name
    """
    return self.df[column_name].value_counts()

In [None]:
# Load 'titanic.csv' into a DataReader and display its 'Embarked' column.
dr = DataReader('titanic.csv')
dr.df['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [None]:
# Encode the 'Embarked' column as integer codes. Rows with a missing
# 'Embarked' value are dropped first, so the row count can shrink.
embarked_cat_code_dict = dr.categorical_to_numerical('Embarked')

print(embarked_cat_code_dict)
print(len(dr.df))

{'S': 0, 'C': 1, 'Q': 2}
889


In [None]:
# Count how many rows carry each 'Embarked' value.
dr.value_counts('Embarked')

0    644
1    168
2     77
Name: Embarked, dtype: int64

In [None]:
# Put 66% of the rows in `train` and the rest in `test`. Rows are shuffled by
# default and no seed is given, so the split differs between runs.
train, test = dr.train_test_split(66)

In [None]:
# Preview the first five rows of the training portion.
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
662,663,0,1,"Colley, Mr. Edward Pomeroy",male,47.0,0,0,5727,25.5875,E58,0
510,511,1,3,"Daly, Mr. Eugene Patrick",male,29.0,0,0,382651,7.75,,2
620,621,0,3,"Yasbeck, Mr. Antoni",male,27.0,1,0,2659,14.4542,,1
68,69,1,3,"Andersson, Miss. Erna Alexandra",female,17.0,4,2,3101281,7.925,,0
300,301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,2


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
662,663,0,1,"Colley, Mr. Edward Pomeroy",male,47.0,0,0,5727,25.5875,E58,0
510,511,1,3,"Daly, Mr. Eugene Patrick",male,29.0,0,0,382651,7.75,,2
620,621,0,3,"Yasbeck, Mr. Antoni",male,27.0,1,0,2659,14.4542,,1
68,69,1,3,"Andersson, Miss. Erna Alexandra",female,17.0,4,2,3101281,7.925,,0
300,301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,2
