In [1]:
import pandas as pd
import numpy as np

### Series

One-dimensional ndarray with axis labels.

In [6]:
my_dict = {"Rose" : 25, "Amy" : 28, "Rory" : 27}

In [7]:
pd.Series(my_dict)

Rose    25
Amy     28
Rory    27
dtype: int64

In [8]:
ages = [25, 28, 27]
names = ["Rose", "Amy", "Rory"]

In [9]:
pd.Series(data = ages, index = names)

Rose    25
Amy     28
Rory    27
dtype: int64

In [10]:
pd.Series(["Rose", "Amy", "Rory"], (25, 28, 27))

25    Rose
28     Amy
27    Rory
dtype: object

In [11]:
# Two Series built over the same labels, so arithmetic between them lines up label by label.
game_1 = pd.Series(data=[20, 10, 8], index=["Rose", "Amy", "Rory"])
game_2 = pd.Series(data=[10, 5, 1], index=["Rose", "Amy", "Rory"])

In [12]:
game_1["Rose"]

20

In [13]:
last_game = game_1 + game_2

In [14]:
last_game

Rose    30
Amy     15
Rory     9
dtype: int64

In [15]:
# Only the labels 'a' and 'd' exist in both Series; adding them yields NaN for every other label.
seri_1 = pd.Series(data=[1, 2, 3, 4], index=list("abcd"))
seri_2 = pd.Series(data=[3, 4, 5, 7], index=list("agfd"))

In [16]:
seri_1 + seri_2

a     4.0
b     NaN
c     NaN
d    11.0
f     NaN
g     NaN
dtype: float64

### Data Frame

Two-dimensional, size-mutable, potentially heterogeneous tabular data.

In [17]:
# 4x3 array drawn from the standard normal distribution. No seed is set,
# so the values (and every output derived from them) change on each run.
data = np.random.randn(4,3)

In [18]:
df = pd.DataFrame(data = data)

In [19]:
df

Unnamed: 0,0,1,2
0,-0.920331,-0.197904,-0.040082
1,1.003017,-1.860772,0.737554
2,-0.520142,-0.9201,-1.312445
3,0.86781,-0.413877,-0.138388


In [20]:
df[0] # first column of the df

0   -0.920331
1    1.003017
2   -0.520142
3    0.867810
Name: 0, dtype: float64

In [21]:
# Same random values as before, now with row labels (index) and column names.
new_df = pd.DataFrame(
    data,
    index=["Rose", "Amy", "Rory", "Clara"],
    columns=["Fee", "Age", "Hours"],
)

In [22]:
new_df

Unnamed: 0,Fee,Age,Hours
Rose,-0.920331,-0.197904,-0.040082
Amy,1.003017,-1.860772,0.737554
Rory,-0.520142,-0.9201,-1.312445
Clara,0.86781,-0.413877,-0.138388


In [23]:
new_df['Age']

Rose    -0.197904
Amy     -1.860772
Rory    -0.920100
Clara   -0.413877
Name: Age, dtype: float64

In [24]:
new_df[['Age', 'Fee']]

Unnamed: 0,Age,Fee
Rose,-0.197904,-0.920331
Amy,-1.860772,1.003017
Rory,-0.9201,-0.520142
Clara,-0.413877,0.86781


- .loc[ ] is primarily label based.
- Access a group of rows and columns by labels.

In [42]:
new_df.loc["Rose"]

Fee     -0.920331
Age     -0.197904
Hours   -0.040082
Name: Rose, dtype: float64

- .iloc[ ] is integer-position based: it selects rows and columns by their 0-based position rather than by label.

In [40]:
new_df.iloc[0]

Fee     -0.920331
Age     -0.197904
Hours   -0.040082
Name: Rose, dtype: float64

In [27]:
# Add a new column computed from existing ones.
# NOTE(review): this adds "Age" to itself (i.e. 2 * Age) — confirm that is the intended formula for "Retire Age".
new_df["Retire Age"] = new_df['Age'] + new_df['Age']

In [28]:
new_df

Unnamed: 0,Fee,Age,Hours,Retire Age
Rose,-0.920331,-0.197904,-0.040082,-0.395808
Amy,1.003017,-1.860772,0.737554,-3.721543
Rory,-0.520142,-0.9201,-1.312445,-1.840199
Clara,0.86781,-0.413877,-0.138388,-0.827754


In [29]:
new_df.drop('Retire Age', axis=1) # drop Retire Age column.

Unnamed: 0,Fee,Age,Hours
Rose,-0.920331,-0.197904,-0.040082
Amy,1.003017,-1.860772,0.737554
Rory,-0.520142,-0.9201,-1.312445
Clara,0.86781,-0.413877,-0.138388


In [30]:
new_df.drop("Clara", axis=0) # with or without axis, same result.

Unnamed: 0,Fee,Age,Hours,Retire Age
Rose,-0.920331,-0.197904,-0.040082,-0.395808
Amy,1.003017,-1.860772,0.737554,-3.721543
Rory,-0.520142,-0.9201,-1.312445,-1.840199


In [31]:
new_df

Unnamed: 0,Fee,Age,Hours,Retire Age
Rose,-0.920331,-0.197904,-0.040082,-0.395808
Amy,1.003017,-1.860772,0.737554,-3.721543
Rory,-0.520142,-0.9201,-1.312445,-1.840199
Clara,0.86781,-0.413877,-0.138388,-0.827754


In [32]:
new_df.drop("Retire Age", axis=1, inplace=True)

In [33]:
new_df

Unnamed: 0,Fee,Age,Hours
Rose,-0.920331,-0.197904,-0.040082
Amy,1.003017,-1.860772,0.737554
Rory,-0.520142,-0.9201,-1.312445
Clara,0.86781,-0.413877,-0.138388


In [34]:
# Chained lookup: .loc["Clara"] selects the row, then ["Fee"] picks the value out of that row.
new_df.loc["Clara"]["Fee"]

0.8678096883740088

In [37]:
new_df.loc["Clara", "Fee"]

0.8678096883740088

In [43]:
new_df < 0

Unnamed: 0,Fee,Age,Hours
Rose,True,True,True
Amy,False,True,False
Rory,True,True,True
Clara,False,True,True


In [44]:
new_df[new_df < 0]

Unnamed: 0,Fee,Age,Hours
Rose,-0.920331,-0.197904,-0.040082
Amy,,-1.860772,
Rory,-0.520142,-0.9201,-1.312445
Clara,,-0.413877,-0.138388


In [45]:
new_df[new_df["Age"] > 0]

Unnamed: 0,Fee,Age,Hours


In [46]:
new_df

Unnamed: 0,Fee,Age,Hours
Rose,-0.920331,-0.197904,-0.040082
Amy,1.003017,-1.860772,0.737554
Rory,-0.520142,-0.9201,-1.312445
Clara,0.86781,-0.413877,-0.138388


### Changing Index

In [48]:
new_df.reset_index()

Unnamed: 0,index,Fee,Age,Hours
0,Rose,-0.920331,-0.197904,-0.040082
1,Amy,1.003017,-1.860772,0.737554
2,Rory,-0.520142,-0.9201,-1.312445
3,Clara,0.86781,-0.413877,-0.138388


In [49]:
new_df

Unnamed: 0,Fee,Age,Hours
Rose,-0.920331,-0.197904,-0.040082
Amy,1.003017,-1.860772,0.737554
Rory,-0.520142,-0.9201,-1.312445
Clara,0.86781,-0.413877,-0.138388


In [50]:
new_index = ["R", "A", "R", "C"]

In [52]:
new_df["New Index"] = new_index

In [53]:
new_df

Unnamed: 0,Fee,Age,Hours,New Index
Rose,-0.920331,-0.197904,-0.040082,R
Amy,1.003017,-1.860772,0.737554,A
Rory,-0.520142,-0.9201,-1.312445,R
Clara,0.86781,-0.413877,-0.138388,C


In [54]:
new_df.set_index("New Index", inplace=True)

In [55]:
new_df

Unnamed: 0_level_0,Fee,Age,Hours
New Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
R,-0.920331,-0.197904,-0.040082
A,1.003017,-1.860772,0.737554
R,-0.520142,-0.9201,-1.312445
C,0.86781,-0.413877,-0.138388


In [56]:
new_df.loc["R"]

Unnamed: 0_level_0,Fee,Age,Hours
New Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
R,-0.920331,-0.197904,-0.040082
R,-0.520142,-0.9201,-1.312445


### Multi-Index

In [57]:
# Outer index level: the show name repeated once per character (three characters per show).
first_index = ["Simpson"] * 3 + ["South Park"] * 3

In [58]:
inner_index = ["Homer", "Bart", "Marge", "Cartman", "Kenny", "Kyle"]

In [59]:
merge_index = list(zip(first_index, inner_index))

In [60]:
merge_index

[('Simpson', 'Homer'),
 ('Simpson', 'Bart'),
 ('Simpson', 'Marge'),
 ('South Park', 'Cartman'),
 ('South Park', 'Kenny'),
 ('South Park', 'Kyle')]

In [61]:
merge_index = pd.MultiIndex.from_tuples(merge_index)

In [62]:
merge_index

MultiIndex([(   'Simpson',   'Homer'),
            (   'Simpson',    'Bart'),
            (   'Simpson',   'Marge'),
            ('South Park', 'Cartman'),
            ('South Park',   'Kenny'),
            ('South Park',    'Kyle')],
           )

In [63]:
# One [number, letter] row per character, in the same order as the index tuples.
cartoonList = [
    [40, "A"],
    [30, "B"],
    [40, "C"],
    [5, "D"],
    [30, "E"],
    [20, "F"],
]

In [64]:
# NOTE(review): cartoonarray is not used by any visible cell — the DataFrame below is
# built from cartoonList directly. Confirm whether this conversion is still needed.
cartoonarray = np.array(cartoonList)

In [67]:
cartoon_df = pd.DataFrame(cartoonList, index=merge_index, columns=["Age", "Job"])

In [68]:
cartoon_df

Unnamed: 0,Unnamed: 1,Age,Job
Simpson,Homer,40,A
Simpson,Bart,30,B
Simpson,Marge,40,C
South Park,Cartman,5,D
South Park,Kenny,30,E
South Park,Kyle,20,F


In [72]:
cartoon_df.loc["Simpson"]

Unnamed: 0,Age,Job
Homer,40,A
Bart,30,B
Marge,40,C


In [73]:
cartoon_df.loc["South Park"].loc["Kenny"]

Age    30
Job     E
Name: Kenny, dtype: object

In [76]:
cartoon_df.index.names = ["Cartoon", "Name"]

In [77]:
cartoon_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Job
Cartoon,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
Simpson,Homer,40,A
Simpson,Bart,30,B
Simpson,Marge,40,C
South Park,Cartman,5,D
South Park,Kenny,30,E
South Park,Kyle,20,F


In [79]:
import pandas as pd
import numpy as np

In [104]:
# Per-city columns with a few missing values (np.nan), used for the dropna() examples.
dict_data = {
    "Istanbul": [30, 29, np.nan],
    "Ankara": [20, np.nan, 25],
    "Izmir": [40, 39, 38],
}
weatherDataFrame = pd.DataFrame(dict_data)

In [93]:
weatherDataFrame

Unnamed: 0,Istanbul,Ankara,Izmir
0,30.0,20.0,40
1,29.0,,39
2,,25.0,38


In [95]:
weatherDataFrame.dropna()

Unnamed: 0,Istanbul,Ankara,Izmir
0,30.0,20.0,40


In [97]:
# dropna() returned a new DataFrame; the original frame is unchanged.
weatherDataFrame

Unnamed: 0,Istanbul,Ankara,Izmir
0,30.0,20.0,40
1,29.0,,39
2,,25.0,38


In [99]:
weatherDataFrame.dropna(axis=1) # axis=1 drops every column that contains at least one NaN

Unnamed: 0,Izmir
0,40
1,39
2,38


In [106]:
# Same cities plus "Antalya", which has only one non-missing value (used for the thresh example).
new_data = {
    "Istanbul": [30, 29, np.nan],
    "Ankara": [20, np.nan, 25],
    "Izmir": [40, 39, 38],
    "Antalya": [45, np.nan, np.nan],
}
newDataFrame = pd.DataFrame(new_data)

In [107]:
newDataFrame

Unnamed: 0,Istanbul,Ankara,Izmir,Antalya
0,30.0,20.0,40,45.0
1,29.0,,39,
2,,25.0,38,


In [108]:
newDataFrame.dropna(axis=1, thresh=2) 

Unnamed: 0,Istanbul,Ankara,Izmir
0,30.0,20.0,40
1,29.0,,39
2,,25.0,38


In [109]:
newDataFrame.fillna(20)

Unnamed: 0,Istanbul,Ankara,Izmir,Antalya
0,30.0,20.0,40,45.0
1,29.0,20.0,39,20.0
2,20.0,25.0,38,20.0


### GroupBy

In [117]:
# Two employees per department, with one fee value each.
fee_dict = {
    "Deparments": ["Software", "Software", "Marketing", "Marketing", "Law", "Law"],
    "Employee": ["Rose", "Amy", "Rory", "Clara", "River", "Melodie"],
    "Fee": [4000, 5000, 6000, 5500, 4560, 5540],
}

In [118]:
fee_df = pd.DataFrame(fee_dict)

In [119]:
fee_df

Unnamed: 0,Deparments,Employee,Fee
0,Software,Rose,4000
1,Software,Amy,5000
2,Marketing,Rory,6000
3,Marketing,Clara,5500
4,Law,River,4560
5,Law,Melodie,5540


In [121]:
group = fee_df.groupby("Deparments")

In [122]:
group.count()

Unnamed: 0_level_0,Employee,Fee
Deparments,Unnamed: 1_level_1,Unnamed: 2_level_1
Law,2,2
Marketing,2,2
Software,2,2


In [123]:
# Average only the numeric columns. The groups still contain the string "Employee" column,
# and since pandas 2.0 mean() no longer drops such columns silently — it raises TypeError.
group.mean(numeric_only=True)

Unnamed: 0_level_0,Fee
Deparments,Unnamed: 1_level_1
Law,5050
Marketing,5750
Software,4500


In [124]:
group.min()

Unnamed: 0_level_0,Employee,Fee
Deparments,Unnamed: 1_level_1,Unnamed: 2_level_1
Law,Melodie,4560
Marketing,Clara,5500
Software,Amy,4000


In [125]:
group.max()

Unnamed: 0_level_0,Employee,Fee
Deparments,Unnamed: 1_level_1,Unnamed: 2_level_1
Law,River,5540
Marketing,Rory,6000
Software,Rose,5000


In [126]:
group.describe()

Unnamed: 0_level_0,Fee,Fee,Fee,Fee,Fee,Fee,Fee,Fee
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Deparments,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Law,2.0,5050.0,692.964646,4560.0,4805.0,5050.0,5295.0,5540.0
Marketing,2.0,5750.0,353.553391,5500.0,5625.0,5750.0,5875.0,6000.0
Software,2.0,4500.0,707.106781,4000.0,4250.0,4500.0,4750.0,5000.0


### Concat

In [134]:
# First of three same-shaped tables that are stacked with pd.concat.
dict1 = {
    "Name": ["Ahmet", "Mehmet", "Zeynep", "Atıl"],
    "Sport": ["Jog", "Swim", "Jog", "Basketboll"],
    "Cals": [100, 200, 300, 400],
}

In [135]:
df = pd.DataFrame(dict1, index=[0, 1, 2, 3])

In [129]:
df

Unnamed: 0,Name,Sport,Cals
0,Ahmet,Jog,100
1,Mehmet,Swim,200
2,Zeynep,Jog,300
3,Atıl,Basketboll,400


In [130]:
# Second table, with the same three columns as dict1.
dict2 = {
    "Name": ["Osman", "Levent", "Atlas", "Fatma"],
    "Sport": ["Jog", "Swim", "Jog", "Basketboll"],
    "Cals": [200, 100, 50, 300],
}

In [131]:
df2 = pd.DataFrame(dict2, index=[4, 5, 6, 7])

In [133]:
# Third table, with the same three columns as dict1 and dict2.
dict3 = {
    "Name": ["Ayse", "Mahmut", "Duygu", "Nur"],
    "Sport": ["Jog", "Swim", "Badminton", "Tenis"],
    "Cals": [300, 400, 500, 250],
}

In [136]:
df3 = pd.DataFrame(dict3, index=[8, 9, 10, 11])

In [137]:
df3

Unnamed: 0,Name,Sport,Cals
8,Ayse,Jog,300
9,Mahmut,Swim,400
10,Duygu,Badminton,500
11,Nur,Tenis,250


In [138]:
df2

Unnamed: 0,Name,Sport,Cals
4,Osman,Jog,200
5,Levent,Swim,100
6,Atlas,Jog,50
7,Fatma,Basketboll,300


In [140]:
df

Unnamed: 0,Name,Sport,Cals
0,Ahmet,Jog,100
1,Mehmet,Swim,200
2,Zeynep,Jog,300
3,Atıl,Basketboll,400


In [141]:
pd.concat([df, df2, df3])

Unnamed: 0,Name,Sport,Cals
0,Ahmet,Jog,100
1,Mehmet,Swim,200
2,Zeynep,Jog,300
3,Atıl,Basketboll,400
4,Osman,Jog,200
5,Levent,Swim,100
6,Atlas,Jog,50
7,Fatma,Basketboll,300
8,Ayse,Jog,300
9,Mahmut,Swim,400
10,Duygu,Badminton,500
11,Nur,Tenis,250


In [152]:
# Left-hand table for the merge example: it shares the "Name" column with merge2.
merge1 = {
    "Name": ["Ahmet", "Mehmet", "Zeynep", "Atıl"],
    "Sport": ["Jog", "Swim", "Jog", "Basketboll"],
}

In [153]:
mergedf1 = pd.DataFrame(merge1, index=['a', 'b', 'c', 'd'])

In [154]:
# Right-hand table for the merge example: same names, with a "Cals" column instead of "Sport".
merge2 = {
    "Name": ["Ahmet", "Mehmet", "Zeynep", "Atıl"],
    "Cals": [100, 200, 300, 400],
}

In [157]:
mergedf2 = pd.DataFrame(merge2)

In [158]:
pd.merge(mergedf1, mergedf2, on='Name')

Unnamed: 0,Name,Sport,Cals
0,Ahmet,Jog,100
1,Mehmet,Swim,200
2,Zeynep,Jog,300
3,Atıl,Basketboll,400


In [160]:
# One row per person: name, department and fee.
fee = {
    "Name": ["Atıl", "Zeynep", "Mehmet", "Ahmet"],
    "Department": ["Software", "Sale", "Marketting", "Software"],
    "Fee": [5000, 5000, 6000, 5500],
}

In [162]:
feedf = pd.DataFrame(fee)

In [163]:
feedf

Unnamed: 0,Name,Department,Fee
0,Atıl,Software,5000
1,Zeynep,Sale,5000
2,Mehmet,Marketting,6000
3,Ahmet,Software,5500


In [165]:
feedf["Department"].unique()

array(['Software', 'Sale', 'Marketting'], dtype=object)

In [166]:
feedf["Department"].nunique()

3

In [167]:
feedf["Department"].value_counts()

Software      2
Marketting    1
Sale          1
Name: Department, dtype: int64

In [169]:
def bruttennete(maas, net_ratio=0.66):
    """Convert a gross amount to a net amount by scaling it with ``net_ratio``.

    The function name is Turkish for "gross to net"; ``maas`` means "salary".

    Parameters
    ----------
    maas : number or array-like
        Gross amount(s); anything that supports multiplication by a float.
    net_ratio : float, default 0.66
        Fraction of the gross amount that remains as net. The default keeps
        the factor that was previously hard-coded, so existing one-argument
        calls (including ``Series.apply(bruttennete)``) behave as before.

    Returns
    -------
    The result of ``maas * net_ratio``.
    """
    return maas * net_ratio

In [170]:
feedf["Fee"].apply(bruttennete) # apply() calls the function on each value of the "Fee" column.

0    3300.0
1    3300.0
2    3960.0
3    3630.0
Name: Fee, dtype: float64

In [172]:
feedf.isnull()

Unnamed: 0,Name,Department,Fee
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


In [177]:
# Note that ("Simpson", "Bart") appears twice, so pivot_table has something to aggregate.
data = {
    "Characters": ["South Park", "South Park", "Simpson", "Simpson", "Simpson"],
    "Name": ["Cartman", "Kenny", "Homer", "Bart", "Bart"],
    "Age": [9, 10, 50, 20, 10],
}

In [178]:
df = pd.DataFrame(data)

In [179]:
df

Unnamed: 0,Characters,Name,Age
0,South Park,Cartman,9
1,South Park,Kenny,10
2,Simpson,Homer,50
3,Simpson,Bart,20
4,Simpson,Bart,10


In [182]:
# aggfunc defaults to "mean"; here the ages are summed per (Characters, Name) pair.
# The string "sum" is used instead of np.sum: passing the NumPy callable
# emits a FutureWarning on pandas >= 2.1, while the string gives the same result.
df.pivot_table(values="Age", index=["Characters", "Name"], aggfunc="sum")

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Characters,Name,Unnamed: 2_level_1
Simpson,Bart,30
Simpson,Homer,50
South Park,Cartman,9
South Park,Kenny,10


In [7]:
# skiprows=1 skips the first row of the sheet before the header row is read.
# NOTE(review): this rebinds `df` (previously the cartoon-characters frame) and expects
# "Kitap1.xlsx" in the current working directory — confirm the file ships with the notebook.
df = pd.read_excel("Kitap1.xlsx", skiprows=1)

In [10]:
new_df = df.dropna()

In [13]:
new_df

Unnamed: 0.1,Unnamed: 0,Maaş,Yas,Depatman
0,Atıl,100,40.0,Yazılım
1,Zeynep,150,59.0,Satış
3,Ahmet,50,10.0,Pazarlama
5,Fatma,200,25.0,Hukuk
