In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Series
A Series is a one-dimensional array-like object containing a sequence of values (of
similar types to NumPy types) and an associated array of data labels, called its index.

In [None]:
# Build a Series from a plain list; with no index given, pandas labels the
# values with the default integer positions 0..3.
series_obj = pd.Series(data=[4, 7, -5, 3])
series_obj

In [None]:
# The underlying data of the Series as an array, without the index labels.
series_obj.values

In [None]:
# Replace the index in place with four string labels (one per value).
series_obj.index = ["aval", "dovom", "sevom", "charom"]

In [None]:
# Label-based lookup: fetch the value stored under the index label "dovom".
series_obj["dovom"]

In [None]:
# Boolean-mask filtering: keep only the entries whose value is greater than 3.
series_obj[series_obj > 3]

In [None]:
# NumPy ufuncs operate element-wise on a Series and keep its index labels.
np.exp(series_obj)

In [None]:
# `in` on a Series tests the index labels, not the stored values.
"aval" in series_obj

In [None]:
# Source mapping of state -> value; 'Tehran' deliberately has no value.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
    'Tehran': None,
}
# Only these labels are kept, in this order; "C" is not a key of sdata, so
# its entry in the resulting Series is missing (NaN).
indx = ["C", 'Ohio', 'Texas']
states = pd.Series(data=sdata, index=indx)
states

In [None]:
# Without an explicit index, the Series takes its labels from the dict keys.
new_states = pd.Series(sdata)
new_states

In [None]:
# Arithmetic between two Series aligns on index labels; a label that is
# missing (or NaN) on either side produces NaN in the result.
states + new_states

In [None]:
# Overwrite every missing entry of `states` in place with 2000.
states.loc[states.isna()] = 2000
states

## DataFrame

In [122]:
# Column-oriented records: each key is a column name and each list holds
# that column's six values, row by row.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)

In [124]:
# Build a DataFrame from the dict; `columns` fixes the column order
# (year, state, pop) instead of following the dict's key order.
df = pd.DataFrame(data, columns=['year', 'state', 'pop'])
df

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [None]:
# Rename the 'year' column to 'sali'. rename() returns a new frame, which is
# bound back to `df`; this avoids inplace=True and keeps the cell safe to
# re-run (a second run finds no 'year' column and changes nothing).
df = df.rename(columns={'year': 'sali'})

In [None]:
# Preview the first rows of the frame (head() shows up to five by default).
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Print a structural summary: index range, column dtypes and non-null counts.
df.info()

In [None]:
# Selecting a single column with [] gives back a pandas Series.
type(df['sali'])

In [None]:
# Attribute-style column access; selects the same column as df['sali'].
df.sali

## Import data

In [125]:
# Load the iris data set; the relative path is resolved against the
# current working directory of the kernel.
iris_df = pd.read_csv('iris.csv')

In [126]:
# Preview the first five rows to check the columns were parsed as expected.
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [127]:
# (number of rows, number of columns)
iris_df.shape

(150, 5)

In [130]:
# Distinct labels that occur in the 'class' column.
iris_df['class'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [131]:
# Summary statistics for the numeric measurement columns ('class' is text and is left out).
iris_df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [132]:
# Load the tips data set; the relative path is resolved against the
# current working directory of the kernel.
tips_df = pd.read_csv('tips.csv')

In [133]:
# Preview only the first three rows.
tips_df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [134]:
# (number of rows, number of columns)
tips_df.shape

(244, 7)

## Reshaping Data

In [237]:
# Small hand-made table of four people, used by the reshaping, sorting and
# subsetting examples that follow. Each key becomes a column.
data_dic = {
    "Name": ['Ali', 'Reza', 'Maryam', 'Ghasem'],
    "Age": [25, 46, 34, 32],
    "Gender": ['M', 'M', 'F', 'M'],
    'Smoking': [True, False, False, True],
}

# Build the frame from the dict defined in this cell, so the cell does not
# depend on any name left over from an earlier kernel session.
df = pd.DataFrame(data_dic)

In [136]:
# Preview the frame (head() shows up to five rows by default).
df.head()

Unnamed: 0,Name,Age,Gender,Smoking
0,Ali,25,M,True
1,Reza,46,M,False
2,Maryam,34,F,False
3,Ghasem,32,M,True


In [161]:
## melt
# Wide -> long: Name and Age stay as identifier columns, every other column
# is unpivoted into (variable, value) pairs.
df.melt(id_vars=['Name', 'Age'])

Unnamed: 0,Name,Age,variable,value
0,Ali,25,Gender,M
1,Reza,46,Gender,M
2,Maryam,34,Gender,F
3,Ghasem,32,Gender,M
4,Ali,25,Smoking,True
5,Reza,46,Smoking,False
6,Maryam,34,Smoking,False
7,Ghasem,32,Smoking,True


In [168]:
# Wide-format example: one identifier column ('key') and three value columns.
df_test = pd.DataFrame({
    'key': ['foo', 'bar', 'baz'],
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9],
})
# Long format: one row per (key, original column) pair.
melted = df_test.melt(id_vars=['key'])
melted

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


In [172]:
# Show the original wide-format frame for comparison with `melted`.
df_test

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [176]:
## pivot
# Long -> wide, the inverse of melt: one row per 'key', one column per
# distinct 'variable', cells taken from 'value'.
# The arguments are passed by keyword because pandas 2.0 made the parameters
# of DataFrame.pivot keyword-only (older versions warn on positional use).
x = melted.pivot(index='key', columns='variable', values='value')
x

  x = melted.pivot('key', 'variable', 'value')


variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7


In [145]:
## concat

# One extra column of addresses (one entry is missing) to place beside df.
df2 = pd.DataFrame(
    {"Address": ["Tehran", "Shiraz", np.nan, "Kerman"]}
)
# Column-wise concatenation: the frames are laid side by side and rows are
# matched on their index labels.
pd.concat(objs=[df, df2], axis="columns")

Unnamed: 0,Name,Age,Gender,Smoking,Address
0,Ali,25,M,True,Tehran
1,Reza,46,M,False,Shiraz
2,Maryam,34,F,False,
3,Ghasem,32,M,True,Kerman


In [155]:
# A second batch of people. It carries an extra column 'A' and one missing
# Smoking value, so the row-wise concat below aligns the columns by name.
df3 = pd.DataFrame({
    "Name": ['Mohsen', 'Mina', 'Arad', 'Davoud'],
    "Age": [54, 31, 46, 32],
    "Gender": ['M', 'F', 'M', 'M'],
    'Smoking': [True, np.nan, False, False],
    'A': [1, 2, 3, 4],
})
# Stack df3 under df in a single concat; ignore_index=True renumbers the
# rows 0..n-1 instead of repeating each frame's own labels.
example = pd.concat([df, df3], ignore_index=True)

In [None]:
## append a single row
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# row is wrapped in a one-row frame and concatenated instead. The result is
# a new frame; df itself is left unchanged.
new_row = {'Name':'Ali', 'Age':25, 'Gender':'M', 'Smoking':True}
pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [151]:
# Returns a new frame without the 'Name' column; df itself is not modified.
df.drop(columns=['Name'])

Unnamed: 0,Age,Gender,Smoking
0,25,M,True
1,46,M,False
2,34,F,False
3,32,M,True


## Method chaining

In [157]:
# Small 3x3 frame, printed once before it is reshaped.
df_chain = pd.DataFrame({
    "a": [4, 5, 6],
    "b": [7, 8, 9],
    "c": [10, 11, 12],
})
print("before \n")
print(df_chain)

# One chained pipeline, each step returning a new frame:
#   1. melt   - unpivot every column into (variable, value) rows
#   2. rename - shorten the generated column names
#   3. query  - keep only the rows whose value is at least 8
df_chain = (
    df_chain
    .melt()
    .rename(columns={'variable': 'var', 'value': 'val'})
    .query('val >= 8')
)
df_chain

before 

   a  b   c
0  4  7  10
1  5  8  11
2  6  9  12


Unnamed: 0,var,val
4,b,8
5,b,9
6,c,10
7,c,11
8,c,12


## Sort

In [159]:
# Sort rows by Age, largest first. Returns a new frame; each row keeps its
# original index label.
df.sort_values('Age', ascending=False)

Unnamed: 0,Name,Age,Gender,Smoking
1,Reza,46,M,False
2,Maryam,34,F,False
3,Ghasem,32,M,True
0,Ali,25,M,True


In [160]:
# Two sort keys with one direction each: Age ascending, then Name descending
# to break ties between equal ages.
df.sort_values(by=['Age', 'Name'], ascending=[True, False])

Unnamed: 0,Name,Age,Gender,Smoking
0,Ali,25,M,True
3,Ghasem,32,M,True
2,Maryam,34,F,False
1,Reza,46,M,False


## Subset Rows/Columns

In [180]:
# Show the frame that the subsetting examples below operate on.
df

Unnamed: 0,Name,Age,Gender,Smoking
0,Ali,25,M,True
1,Reza,46,M,False
2,Maryam,34,F,False
3,Ghasem,32,M,True


In [183]:
# Rows that satisfy both conditions: Age above 30 AND Gender equal to "M".
df[df["Age"].gt(30) & df["Gender"].eq("M")]

Unnamed: 0,Name,Age,Gender,Smoking
1,Reza,46,M,False
3,Ghasem,32,M,True


In [178]:
# Keep only the rows where Age is greater than 35.
df[df.Age > 35]

Unnamed: 0,Name,Age,Gender,Smoking
1,Reza,46,M,False


In [187]:
# Draw a random 20% of the rows. random_state pins the random draw so a
# fresh Restart & Run All selects the same rows every time.
df.sample(frac=0.2, random_state=0)

Unnamed: 0,Name,Age,Gender,Smoking
3,Ghasem,32,M,True


In [197]:
# Remove rows that repeat an earlier row in every column.
# Equivalent boolean-mask form: df[~df.duplicated()]
df.drop_duplicates()

Unnamed: 0,Name,Age,Gender,Smoking
0,Ali,25,M,True
1,Reza,46,M,False
2,Maryam,34,F,False
3,Ghasem,32,M,True


In [213]:
# Take the columns from position 2 onward as an explicit copy, so the edit
# below changes only `x` and never `df`, and pandas has no reason to raise
# SettingWithCopyWarning.
x = df.iloc[:, 2:].copy()
# Row position 3, first column of the slice.
x.iloc[3, 0] = 'F'
x

Unnamed: 0,Gender,Smoking
0,M,True
1,M,False
2,F,False
3,F,True


In [218]:
# Display df again to compare it with the edited slice `x`.
df

Unnamed: 0,Name,Age,Gender,Smoking
0,Ali,25,M,True
1,Reza,46,M,False
2,Maryam,34,F,False
3,Ghasem,32,M,True


In [234]:
# Replace the row index in place with string labels (one label per row).
# NOTE(review): this mutates df for every later cell — confirm that is intended.
df.index = ['a', 'b', 'c', 'd']
# df.reindex([2, 3, 1, 4])  # disabled experiment, kept for reference
# Label-based scalar lookup: row 'a', column 'Name'.
df.loc['a','Name']

'Ali'

## Handling Missing Data and Cleaning

In [241]:
# Stack a one-row, one-column frame under df. The new row has no Age,
# Gender or Smoking, so those cells are missing (NaN) in the result; the
# new row also keeps its own index label because ignore_index is not set.
x = pd.concat([df, pd.DataFrame({"Name": ["Akbar"]})])
x

Unnamed: 0,Name,Age,Gender,Smoking
0,Ali,25.0,M,True
1,Reza,46.0,M,False
2,Maryam,34.0,F,False
3,Ghasem,32.0,M,True
0,Akbar,,,


In [240]:
# Return a copy of x without the rows that contain at least one missing
# value; x itself is left untouched.
x.dropna(axis="index", how="any")

Unnamed: 0,Name,Age,Gender,Smoking
0,Ali,25.0,M,True
1,Reza,46.0,M,False
2,Maryam,34.0,F,False
3,Ghasem,32.0,M,True


In [243]:
# Mean of the Age column, cast to an 8-bit integer (the fractional part is dropped).
mean_age = df["Age"].mean().astype("int8")
# A scalar fill value is applied to every column, so missing Gender and
# Smoking cells also receive the mean age here. inplace=False returns a new
# frame and leaves x unchanged.
x.fillna(mean_age, inplace = False)

Unnamed: 0,Name,Age,Gender,Smoking
0,Ali,25.0,M,True
1,Reza,46.0,M,False
2,Maryam,34.0,F,False
3,Ghasem,32.0,M,True
0,Akbar,34.0,34,34


In [244]:
# A dict restricts the fill to the listed columns: only missing Age values
# are replaced, other columns keep their NaN.
x.fillna({"Age": mean_age})

Unnamed: 0,Name,Age,Gender,Smoking
0,Ali,25.0,M,True
1,Reza,46.0,M,False
2,Maryam,34.0,F,False
3,Ghasem,32.0,M,True
0,Akbar,34.0,,


In [None]:
# Stack df on top of itself, then flag each row that repeats an earlier row
# (True marks a duplicate).
pd.concat([df] * 2).duplicated()

In [None]:
# Same doubled frame with the repeated rows removed (the first occurrence is kept).
pd.concat([df] * 2).drop_duplicates()

In [None]:
# Compare only Age and Name when looking for duplicates; keep='last'
# retains the final occurrence of each (Age, Name) pair.
pd.concat([df] * 2).drop_duplicates(['Age', 'Name'], keep='last')

### Data Transformation

In [None]:
# Vectorised string method: lower-case every name.
df['Name'].str.lower()

In [104]:
# Lookup table from full name to a two-letter code.
map_names = {"Ali": "AL", "Reza": "RE", "Maryam": "MA", "Ghasem": "GH"}
# Series.map accepts the dict directly, so no lambda wrapper is needed.
# A name that is missing from the table maps to NaN instead of raising KeyError.
df['Name'].map(map_names)

0    AL
1    RE
2    MA
3    GH
Name: Name, dtype: object

## Summarize Data

In [257]:
# Positional in-place assignment: the cell at row position 1, column
# position 1 is set to 25. With the Name/Age/Gender/Smoking column order
# that is the second row's Age — TODO confirm the column order at run time.
df.iloc[1, 1] = 25

In [258]:
# Count how many rows carry each distinct Age value.
df['Age'].value_counts()

25    2
34    1
32    1
Name: Age, dtype: int64

In [261]:
# Number of distinct values in each column.
df.nunique()

Name       4
Age        3
Gender     3
Smoking    2
dtype: int64

In [91]:
# Number of rows in the DataFrame.
len(df)

4

## String Manipulation

In [263]:
# Person -> e-mail address; 'Wes' has no address and is stored as NaN.
data = dict(
    Dave='dave@google.com',
    Steve='steve@gmail.com',
    Rob='rob@gmail.com',
    Wes=np.nan,
)

In [264]:
# Convert the mapping into a Series; the keys become the index labels.
# NOTE(review): `data` is rebound from a dict to a Series here — a distinct
# name would make the change of type easier to follow.
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [108]:
# True where the value is missing (NaN), False elsewhere.
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [109]:
# Substring test per element; the missing entry stays NaN rather than
# becoming False.
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [270]:
# Rows whose Name contains an upper-case 'A' (the test is case-sensitive).
df[df['Name'].str.contains('A')]

Unnamed: 0,Name,Age,Gender,Smoking
0,Ali,25,M,True


In [110]:
# E-mail pattern with three capture groups: local part, domain name, and a
# 2-4 letter suffix. Written as a raw string so the escaped dot needs only a
# single backslash. The character classes are upper-case only, so callers
# pass re.IGNORECASE to match lower-case addresses.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [112]:
# Find every match of the e-mail pattern in each string; each match is a
# tuple of the pattern's three captured groups. IGNORECASE lets the
# upper-case character classes match lower-case addresses. `re` is imported
# with the other modules in the first cell.
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [117]:
# Test whether each string matches the pattern from its start; the missing
# entry stays NaN. IGNORECASE lets the upper-case classes match lower-case text.
data.str.match(pattern, flags=re.IGNORECASE)

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

## Map

In [274]:
# Show the current state of df before the mapping example.
df

Unnamed: 0,Name,Age,Gender,Smoking
0,Ali,25,M,True
1,Reza,25,25,False
2,Maryam,34,F,False
3,Ghasem,32,M,True


In [294]:
# Lookup table that translates each Gender code into a two-character string.
maped_data = {"M": "10", "F": "01"}

# Disabled alternatives, kept for reference:
# df["Smoking"].map(lambda x: maped_data[x])
# df["Name"].map(lambda x: x[:2])
# Translate every Gender value through the dict; values that are not keys
# of the dict come out as NaN.
df["Gender"].map(maped_data)

# iris_df["total"]

sepal_length
sepal_width
petal_length
petal_width
class
total


In [295]:
# Iterating a DataFrame walks its column labels; print one per line.
for column_name in iris_df.columns:
    print(column_name)

sepal_length
sepal_width
petal_length
petal_width
class
total


In [None]:
# Add the two petal measurements column-wise once, instead of doing two
# chained label look-ups per row. This also leaves the `indx` list defined
# earlier in the notebook untouched.
petal_totals = iris_df["petal_length"] + iris_df["petal_width"]
# One line per row. The messages appear to be transliterated Persian for
# "greater than 5" / "less than 5"; a total of exactly 5 takes the else branch.
for total in petal_totals:
    if total > 5:
        print("bozorgtar az 5")
    else:
        print("kochiktar az 5")

## apply

In [286]:
def f(x):
    """Return True when ``x`` is strictly greater than 30, otherwise False."""
    return bool(x > 30)
    
# Apply f to every Age value and store the boolean results in a new
# "mojaz" column (added to df in place), then display the frame.
df["mojaz"] = df["Age"].apply(f)
df

Unnamed: 0,Name,Age,Gender,Smoking,mojaz
0,Ali,25,M,True,False
1,Reza,25,25,False,False
2,Maryam,34,F,False,True
3,Ghasem,32,M,True,True


## Grouping

In [297]:
# Preview the first four rows before grouping.
tips_df.head(4)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2


In [303]:
# Average total bill within each 'sex' group.
tips_df.groupby('sex')['total_bill'].mean()

sex
Female    18.056897
Male      20.744076
Name: total_bill, dtype: float64

In [304]:
# Average tip for every (sex, time) combination; the result is indexed by
# both grouping keys.
tips_df.groupby(['sex', 'time'])['tip'].mean()

sex     time  
Female  Dinner    3.002115
        Lunch     2.582857
Male    Dinner    3.144839
        Lunch     2.882121
Name: tip, dtype: float64

In [121]:
# Scale every bill by the largest bill, so the new 'normal bill' column
# holds each bill as a fraction of the maximum.
largest_bill = tips_df['total_bill'].max()
tips_df['normal bill'] = tips_df['total_bill'].div(largest_bill)
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,normal bill
0,16.99,1.01,Female,No,Sun,Dinner,2,0.334383
1,10.34,1.66,Male,No,Sun,Dinner,3,0.203503
2,21.01,3.50,Male,No,Sun,Dinner,3,0.413501
3,23.68,3.31,Male,No,Sun,Dinner,2,0.466050
4,24.59,3.61,Female,No,Sun,Dinner,4,0.483960
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.571344
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.534934
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.446172
242,17.82,1.75,Male,No,Sat,Dinner,2,0.350718
