# Pipelines using Pandas

In [1]:
import pandas as pd

candidates= {
    'Full_Name':["Aida, Kone","Mamadou, Diop","Ismael, Camara","Aicha, Konate",
                 "Fanta, Koumare", "Khalil, Cisse"],
    'Degree':['Master','Master','Bachelor', "PhD", "Master", "PhD"],
    'From':["Abidjan","Dakar","Bamako", "Abidjan","Konakry", "Lomé"],
    'Application_date': ['11/17/2022', '09/23/2022', '12/2/2021', 
                         '08/25/2022', '01/07/2022', '12/26/2022'],
    'From_office (min)': [120,95,75, 80,100, 34]
          }
candidates_df = pd.DataFrame(candidates)
candidates_df['Application_date'] = pd.to_datetime(candidates_df["Application_date"])

candidates_df

Unnamed: 0,Full_Name,Degree,From,Application_date,From_office (min)
0,"Aida, Kone",Master,Abidjan,2022-11-17,120
1,"Mamadou, Diop",Master,Dakar,2022-09-23,95
2,"Ismael, Camara",Bachelor,Bamako,2021-12-02,75
3,"Aicha, Konate",PhD,Abidjan,2022-08-25,80
4,"Fanta, Koumare",Master,Konakry,2022-01-07,100
5,"Khalil, Cisse",PhD,Lomé,2022-12-26,34


**Task 1**

In [2]:
def get_first_last_name(df, col_name):

  final_df = df.copy()

  splited_full_name = final_df[col_name].str.split(",", expand=True)

  final_df["First_Name"] = splited_full_name.get(0)
  final_df["Last_Name"] = splited_full_name.get(1)

  return final_df

result_task1 = get_first_last_name(candidates_df, "Full_Name")
result_task1

Unnamed: 0,Full_Name,Degree,From,Application_date,From_office (min),First_Name,Last_Name
0,"Aida, Kone",Master,Abidjan,2022-11-17,120,Aida,Kone
1,"Mamadou, Diop",Master,Dakar,2022-09-23,95,Mamadou,Diop
2,"Ismael, Camara",Bachelor,Bamako,2021-12-02,75,Ismael,Camara
3,"Aicha, Konate",PhD,Abidjan,2022-08-25,80,Aicha,Konate
4,"Fanta, Koumare",Master,Konakry,2022-01-07,100,Fanta,Koumare
5,"Khalil, Cisse",PhD,Lomé,2022-12-26,34,Khalil,Cisse


In [3]:
#candidates_df["Full_Name"].str.split(",", expand=True)

**Task 2**

In [4]:
def get_application_date_info(df, column_name):

  application_date = df[column_name]

  final_df = df.copy()

  final_df["Day"] = application_date.dt.day 
  final_df["Month"] = application_date.dt.month 
  final_df["Year"] = application_date.dt.year 
  final_df["Day_of_week"] = application_date.dt.day_name()
  final_df["Month_of_year"] = application_date.dt.month_name()

  return final_df

result_task2 = get_application_date_info(candidates_df, "Application_date")
result_task2

Unnamed: 0,Full_Name,Degree,From,Application_date,From_office (min),Day,Month,Year,Day_of_week,Month_of_year
0,"Aida, Kone",Master,Abidjan,2022-11-17,120,17,11,2022,Thursday,November
1,"Mamadou, Diop",Master,Dakar,2022-09-23,95,23,9,2022,Friday,September
2,"Ismael, Camara",Bachelor,Bamako,2021-12-02,75,2,12,2021,Thursday,December
3,"Aicha, Konate",PhD,Abidjan,2022-08-25,80,25,8,2022,Thursday,August
4,"Fanta, Koumare",Master,Konakry,2022-01-07,100,7,1,2022,Friday,January
5,"Khalil, Cisse",PhD,Lomé,2022-12-26,34,26,12,2022,Monday,December


**Task 3**

In [5]:
def info_by_row(row):

  # Select columns of interest 
  full_name = row.Full_Name.replace(",", " ")
  is_from = row.From
  degree = row.Degree
  from_office = row["From_office (min)"]

  # Generate the description from previous variables
  info = f"""{full_name} from {is_from} holds a {degree} degree 
              and lives {from_office} from the office"""

  return info

# Create the info
def candidate_info(df):

  final_df = df.copy()

  final_df["Info"] = final_df.apply(lambda row: info_by_row(row), axis=1)

  return final_df

# Apply the function for Task 3
result_task3 = candidate_info(candidates_df)
result_task3

Unnamed: 0,Full_Name,Degree,From,Application_date,From_office (min),Info
0,"Aida, Kone",Master,Abidjan,2022-11-17,120,Aida Kone from Abidjan holds a Master degree ...
1,"Mamadou, Diop",Master,Dakar,2022-09-23,95,Mamadou Diop from Dakar holds a Master degree...
2,"Ismael, Camara",Bachelor,Bamako,2021-12-02,75,Ismael Camara from Bamako holds a Bachelor de...
3,"Aicha, Konate",PhD,Abidjan,2022-08-25,80,Aicha Konate from Abidjan holds a PhD degree ...
4,"Fanta, Koumare",Master,Konakry,2022-01-07,100,Fanta Koumare from Konakry holds a Master deg...
5,"Khalil, Cisse",PhD,Lomé,2022-12-26,34,Khalil Cisse from Lomé holds a PhD degree \n ...


**Use of Pipe**

In [6]:
# Create the pipe by using calling all the functions. 
preprocessed_candidates = (candidates_df.
                            pipe(get_first_last_name, "Full_Name").
                            pipe(get_application_date_info, "Application_date").
                            pipe(candidate_info)
                          )

# Show the final result
preprocessed_candidates

Unnamed: 0,Full_Name,Degree,From,Application_date,From_office (min),First_Name,Last_Name,Day,Month,Year,Day_of_week,Month_of_year,Info
0,"Aida, Kone",Master,Abidjan,2022-11-17,120,Aida,Kone,17,11,2022,Thursday,November,Aida Kone from Abidjan holds a Master degree ...
1,"Mamadou, Diop",Master,Dakar,2022-09-23,95,Mamadou,Diop,23,9,2022,Friday,September,Mamadou Diop from Dakar holds a Master degree...
2,"Ismael, Camara",Bachelor,Bamako,2021-12-02,75,Ismael,Camara,2,12,2021,Thursday,December,Ismael Camara from Bamako holds a Bachelor de...
3,"Aicha, Konate",PhD,Abidjan,2022-08-25,80,Aicha,Konate,25,8,2022,Thursday,August,Aicha Konate from Abidjan holds a PhD degree ...
4,"Fanta, Koumare",Master,Konakry,2022-01-07,100,Fanta,Koumare,7,1,2022,Friday,January,Fanta Koumare from Konakry holds a Master deg...
5,"Khalil, Cisse",PhD,Lomé,2022-12-26,34,Khalil,Cisse,26,12,2022,Monday,December,Khalil Cisse from Lomé holds a PhD degree \n ...


# Cross Validation

In [38]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [14]:
iris = load_iris()

**Logistic Regression**

In [37]:
l_scores = cross_val_score(LogisticRegression(), iris.data, iris.target, cv=5)
l_scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [31]:
np.average(l_scores)

0.9733285917496444

**Supported Vector Machines**

In [35]:
s_scores = cross_val_score(SVC(), iris.data, iris.target, cv=5)
s_scores

array([0.96666667, 0.96666667, 0.96666667, 0.93333333, 1.        ])

In [36]:
np.average(s_scores)

0.9666666666666666

**Riesgos de no usar CV**

In [40]:
from sklearn.datasets import load_digits
digits = load_digits()

**CAMBIAR EL RANDOM STATE!!** , random_state=124

In [88]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =  train_test_split(digits.data, digits.target,
                                                    test_size=0.3)

In [89]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9611111111111111

**K-Fold CV**

In [94]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
kf

KFold(n_splits=5, random_state=None, shuffle=False)

In [95]:
# Representación visual
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9,10]):
    print(train_index, test_index)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [105]:
# Corrida de modelo simple
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [99]:
get_score(LogisticRegression(), X_train, X_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9611111111111111

In [101]:
# Stratified kFold
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=5)

In [117]:
scores_lr = []
scores_rf = []

for train_index, test_index in kf.split(digits.data):
    X_train, X_test, y_train, y_test = digits.data[train_index], \
                                        digits.data[test_index], \
                                        digits.target[train_index], \
                                        digits.target[test_index]
    
    scores_lr.append(get_score(LogisticRegression(), X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(), X_train, X_test, y_train, y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [122]:
print("scores_lr: ", scores_lr)
print("AVG: ", np.mean(scores_lr))

scores_lr:  [0.9277777777777778, 0.8666666666666667, 0.9387186629526463, 0.935933147632312, 0.9080779944289693]
AVG:  0.9154348498916743


In [123]:
print("scores_rf", scores_rf)
print("AVG: ", np.mean(scores_rf))

scores_rf [0.9305555555555556, 0.9194444444444444, 0.9637883008356546, 0.9637883008356546, 0.9275766016713092]
AVG:  0.9410306406685237


**Lo mismo, pero usando la versión de sklearn**

In [125]:
from sklearn.model_selection import cross_val_score

In [126]:
cross_val_score(LogisticRegression(), digits.data, digits.target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.92222222, 0.86944444, 0.94150418, 0.93871866, 0.89693593])

In [137]:
# Comparando diferentes parámetros
print("RF #1: ", cross_val_score(RandomForestClassifier(n_estimators=5), digits.data, digits.target).mean())
print("RF #2: ", cross_val_score(RandomForestClassifier(n_estimators=3), digits.data, digits.target).mean())

RF #1:  0.8436737852058187
RF #2:  0.7930269266480966
