In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msnofrom
from sklearn.preprocessing import MinMaxScaler,LabelEncoder,StandardScaler,RobustScaler

In [2]:
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)
pd.set_option("display.float_format",lambda x: '%.3f' % x)
pd.set_option("display.width", 500)

In [3]:
def load():
    data=pd.read_csv("titanic.csv")
    return data
def load_application_train():
    data=pd.read_csv("application_train.csv")
    return data
def grab_col_names(dataframe,cat_th=10,car_th=20):
    cat_cols=[col for col in dataframe.columns if dataframe[col].dtypes=="O"] #tipi obcejt olanları al
    num_but_cat=[col for col in dataframe.columns if dataframe[col].nunique()<cat_th and
             dataframe[col].dtypes!="O"] #numerik görünen kategörikleri aldık
    cat_but_car=[col for col in dataframe.columns if dataframe[col].nunique()>car_th and
                dataframe[col].dtypes=="O"] #kategorik görünen kardinalleri aldık.
    
    cat_cols=cat_cols+num_but_cat #kategörikler güncellendi
    cat_cols=[col for col in cat_cols if col not in cat_but_car] #cat_cols içinde kardinalleri ayırdık
    
    num_cols=[col for col in dataframe.columns if dataframe[col].dtypes!="O"] #tipi objectden farkı olanları getir
    num_cols=[col for col in num_cols if col not in num_but_cat]
    
    print(f"Observations(Gözlem):{dataframe.shape[0]}")
    print(f"Variables(Öznitelik): {dataframe.shape[1]}")
    print(f"cat_cols:{len(cat_cols)}")
    print(f"num_cols:{len(num_cols)} ")
    print(f"cat_but_car:{len(cat_but_car)} ")
    print(f"num_but_cat:{len(num_but_cat)} ")
    
    return cat_cols,num_cols,cat_but_car

### Var olan değişkenlerden yeni değişlenler türetme

In [5]:
df=load()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Binary Features

In [6]:
df["New_Cabin_Bool"]=df["Cabin"].notnull().astype("int")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,New_Cabin_Bool
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [7]:
df.groupby("New_Cabin_Bool").agg({"Survived":"mean"})
#bir ilişki bulduk mu???

Unnamed: 0_level_0,Survived
New_Cabin_Bool,Unnamed: 1_level_1
0,0.3
1,0.667


In [9]:
from statsmodels.stats.proportion import proportions_ztest

test_stat, pvalue = proportions_ztest(count=[df.loc[df["New_Cabin_Bool"] == 1, "Survived"].sum(),
                                             df.loc[df["New_Cabin_Bool"] == 0, "Survived"].sum()],

                                      nobs=[df.loc[df["New_Cabin_Bool"] == 1, "Survived"].shape[0],
                                            df.loc[df["New_Cabin_Bool"] == 0, "Survived"].shape[0]])

In [10]:
print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))
##aralarında istatisliksel olarak anlam vardır.

Test Stat = 9.4597, p-value = 0.0000


In [11]:
df.loc[((df['SibSp'] + df['Parch']) > 0), "NEW_IS_ALONE"] = "NO"
df.loc[((df['SibSp'] + df['Parch']) == 0), "NEW_IS_ALONE"] = "YES"
#akrabalık ilişkileri ile yeni bir değişken oluşturduk

In [14]:
df.groupby("NEW_IS_ALONE").agg({"Survived":"mean"})

Unnamed: 0_level_0,Survived
NEW_IS_ALONE,Unnamed: 1_level_1
NO,0.506
YES,0.304


In [15]:
#Hiporez testi yapalım
test_stat, pvalue = proportions_ztest(count=[df.loc[df["NEW_IS_ALONE"] == "YES", "Survived"].sum(),
                                             df.loc[df["NEW_IS_ALONE"] == "NO", "Survived"].sum()],

                                      nobs=[df.loc[df["NEW_IS_ALONE"] == "YES", "Survived"].shape[0],
                                            df.loc[df["NEW_IS_ALONE"] == "NO", "Survived"].shape[0]])

print('Test Stat = %.4f, p-value = %.4f' % (test_stat, pvalue))

Test Stat = -6.0704, p-value = 0.0000


# Text Features

In [16]:
df=load()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
#Letter Count Harfleri Saydırma
df["New_Name_Count"]=df["Name"].str.len()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,New_Name_Count
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,51
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24


In [18]:
#Word Count Kelimeleri Saydırma
df["New_Name_Word_Count"]=df["Name"].apply(lambda x:len(str(x).split(" ")))
# x i string e çevir, boşluklara göre split et sonra kaç split sayısını say
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,New_Name_Count,New_Name_Word_Count
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,4
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,51,7
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,7
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,4


In [25]:
##ÖZel yapıları yakalamak
df['New_Name_Dr']=df["Name"].apply(lambda x:len([x for x in x.split() if x.startswith("Dr")]))
df[df["New_Name_Dr"]==1]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,New_Name_Count,New_Name_Word_Count,New_Name_Dr
130,131,0,3,"Drazenoic, Mr. Jozef",male,33.0,0,0,349241,7.896,,C,20,3,1
245,246,0,1,"Minahan, Dr. William Edward",male,44.0,2,0,19928,90.0,C78,Q,27,4,1
317,318,0,2,"Moraweck, Dr. Ernest",male,54.0,0,0,29011,14.0,,S,20,3,1
398,399,0,2,"Pain, Dr. Alfred",male,23.0,0,0,244278,10.5,,S,16,3,1
416,417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34.0,1,1,28220,32.5,,S,47,7,1
632,633,1,1,"Stahelin-Maeglin, Dr. Max",male,32.0,0,0,13214,30.5,B50,C,25,3,1
660,661,1,1,"Frauenthal, Dr. Henry William",male,50.0,2,0,PC 17611,133.65,,S,29,4,1
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.329,B51 B53 B55,C,34,5,1
766,767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C,25,4,1
796,797,1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.929,D17,S,27,4,1


In [27]:
df.groupby("New_Name_Dr").agg({"Survived":["mean","count"]})
#geçerli bir veri mi sizce

Unnamed: 0_level_0,Survived,Survived
Unnamed: 0_level_1,mean,count
New_Name_Dr,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0.383,881
1,0.5,10


## Regex Feature
İsimlerdeki ünvanları çekmek

In [29]:
df['NEW_TITLE'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,New_Name_Count,New_Name_Word_Count,New_Name_Dr,NEW_TITLE
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,4,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,51,7,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,3,0,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,7,0,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,4,0,Mr


In [30]:
df[["NEW_TITLE","Survived","Age"]].groupby(["NEW_TITLE"]).agg({"Survived":"mean","Age":["count","mean"]})
#kategorik değişken kırılımı yakaladık. Boş olan yaş değişkenlerini doldururken bunu baza alırsak çok iyi olur

Unnamed: 0_level_0,Survived,Age,Age
Unnamed: 0_level_1,mean,count,mean
NEW_TITLE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Capt,0.0,1,70.0
Col,0.5,2,58.0
Countess,1.0,1,33.0
Don,0.0,1,40.0
Dr,0.429,6,42.0
Jonkheer,0.0,1,38.0
Lady,1.0,1,48.0
Major,0.5,2,48.5
Master,0.575,36,4.574
Miss,0.698,146,21.774


## Date değişkenleri üzerine

In [32]:
dff = pd.read_csv("course_reviews.csv")
print(dff.head())
dff.info()

   Rating            Timestamp             Enrolled  Progress  Questions Asked  Questions Answered
0   5.000  2021-02-05 07:45:55  2021-01-25 15:12:08     5.000            0.000               0.000
1   5.000  2021-02-04 21:05:32  2021-02-04 20:43:40     1.000            0.000               0.000
2   4.500  2021-02-04 20:34:03  2019-07-04 23:23:27     1.000            0.000               0.000
3   5.000  2021-02-04 16:56:28  2021-02-04 14:41:29    10.000            0.000               0.000
4   4.000  2021-02-04 15:00:24  2020-10-13 03:10:07    10.000            0.000               0.000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4323 entries, 0 to 4322
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rating              4323 non-null   float64
 1   Timestamp           4323 non-null   object 
 2   Enrolled            4323 non-null   object 
 3   Progress            4323 non-null   float64
 4   Q

In [37]:
dff['Timestamp'] = pd.to_datetime(dff["Timestamp"], format="%Y-%m-%d %H:%M:%S")
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4323 entries, 0 to 4322
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Rating              4323 non-null   float64       
 1   Timestamp           4323 non-null   datetime64[ns]
 2   Enrolled            4323 non-null   object        
 3   Progress            4323 non-null   float64       
 4   Questions Asked     4323 non-null   float64       
 5   Questions Answered  4323 non-null   float64       
dtypes: datetime64[ns](1), float64(4), object(1)
memory usage: 202.8+ KB


In [40]:
from datetime import date
dff["year"]=dff["Timestamp"].dt.year
dff["month"]=dff["Timestamp"].dt.month
dff['year_diff'] = date.today().year - dff['Timestamp'].dt.year #yılların farkı

# month diff (iki tarih arasındaki ay farkı): yıl farkı + ay farkı
dff['month_diff'] = (date.today().year - dff['Timestamp'].dt.year) * 12 + date.today().month - dff['Timestamp'].dt.month

dff["day_name"]=dff["Timestamp"].dt.day_name() #gün isimleri

In [41]:
dff.head()

Unnamed: 0,Rating,Timestamp,Enrolled,Progress,Questions Asked,Questions Answered,year,month,year_diff,month_diff,day_name
0,5.0,2021-02-05 07:45:55,2021-01-25 15:12:08,5.0,0.0,0.0,2021,2,3,37,Friday
1,5.0,2021-02-04 21:05:32,2021-02-04 20:43:40,1.0,0.0,0.0,2021,2,3,37,Thursday
2,4.5,2021-02-04 20:34:03,2019-07-04 23:23:27,1.0,0.0,0.0,2021,2,3,37,Thursday
3,5.0,2021-02-04 16:56:28,2021-02-04 14:41:29,10.0,0.0,0.0,2021,2,3,37,Thursday
4,4.0,2021-02-04 15:00:24,2020-10-13 03:10:07,10.0,0.0,0.0,2021,2,3,37,Thursday


# Feature Interactions (Özellik Etkileşimleri)


In [43]:
df=load()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
df["NEW_AGE_PCLASS"]=df["Age"]*df["Pclass"]
#aynı standart formda varsayılmıştır
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_AGE_PCLASS
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,66.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,38.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,78.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,105.0


In [48]:
df["NEW_FAMILY_SIZE"] = df["SibSp"] + df["Parch"] + 1
#aile büyüklüğü

In [50]:
df.loc[(df['Sex'] == 'male') & (df['Age'] <= 21), 'NEW_SEX_CAT'] = 'youngmale'
#genç erkekler

In [51]:
df.loc[(df['Sex'] == 'male') & (df['Age'] > 21) & (df['Age'] < 50), 'NEW_SEX_CAT'] = 'maturemale'
#yetişkin erkekler

In [53]:
df.loc[(df['Sex'] == 'male') & (df['Age'] >= 50), 'NEW_SEX_CAT'] = 'seniormale'
#kıdemli erkekler

In [55]:
####################################################
df.loc[(df['Sex'] == 'female') & (df['Age'] <= 21), 'NEW_SEX_CAT'] = 'youngfemale'
#genç kadınlar

In [58]:
df.loc[(df['Sex'] == 'female') & (df['Age'] > 21) & (df['Age'] < 50), 'NEW_SEX_CAT'] = 'maturefemale'
#yetişkin kadınlar

In [60]:
df.loc[(df['Sex'] == 'female') & (df['Age'] >= 50), 'NEW_SEX_CAT'] = 'seniorfemale'
#kıdemli kadınlar

In [61]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,NEW_AGE_PCLASS,NEW_FAMILY_SIZE,NEW_SEX_CAT
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,66.0,2,maturemale
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.283,C85,C,38.0,2,maturefemale
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,78.0,1,maturefemale
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,2,maturefemale
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,105.0,1,maturemale


In [62]:
df.groupby("NEW_SEX_CAT")["Survived"].mean()

NEW_SEX_CAT
maturefemale   0.774
maturemale     0.199
nan            0.294
seniorfemale   0.909
seniormale     0.135
youngfemale    0.679
youngmale      0.250
Name: Survived, dtype: float64