In [1]:
import pandas as pd

## Series

In [3]:
pd_series = pd.Series([1, 2, 3, 4, 5, 6])

In [4]:
pd_series

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5
5,6


## DataFrame

In [7]:
data = {
    'name': ['ahmed', 'mahmoud'],
    'age': [25, 22],
    'country': ['Egy', 'Egy']
}

df = pd.DataFrame(data)
df

Unnamed: 0,name,age,country
0,ahmed,25,Egy
1,mahmoud,22,Egy


In [8]:
df.head()

Unnamed: 0,name,age,country
0,ahmed,25,Egy
1,mahmoud,22,Egy


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     2 non-null      object
 1   age      2 non-null      int64 
 2   country  2 non-null      object
dtypes: int64(1), object(2)
memory usage: 180.0+ bytes


In [10]:
df.describe()

Unnamed: 0,age
count,2.0
mean,23.5
std,2.12132
min,22.0
25%,22.75
50%,23.5
75%,24.25
max,25.0


In [11]:
df['age']

Unnamed: 0,age
0,25
1,22


In [13]:
filtered = df[df['name'] == 'mahmoud']
filtered

Unnamed: 0,name,age,country
1,mahmoud,22,Egy


In [17]:
# label based indexing
selected = df.loc[0:1, ['name', 'country']]
selected

Unnamed: 0,name,country
0,ahmed,Egy
1,mahmoud,Egy


In [18]:
# index based selection
selected = df.iloc[0:1, 0:1]
selected

Unnamed: 0,name
0,ahmed


In [21]:
df['grad_year'] = [2021, 2025]
df

Unnamed: 0,name,age,country,birth_year,grad_year
0,ahmed,25,Egy,2000,2021
1,mahmoud,22,Egy,2003,2025


In [23]:
df['birth_year'] = 2025 - df['age']
df

Unnamed: 0,name,age,country,birth_year,grad_year
0,ahmed,25,Egy,2000,2021
1,mahmoud,22,Egy,2003,2025


In [24]:
df = df.drop('grad_year', axis = 1)

In [25]:
df

Unnamed: 0,name,age,country,birth_year
0,ahmed,25,Egy,2000
1,mahmoud,22,Egy,2003


In [34]:
data_na = {
    'name': ['ahmed', 'mahmoud', 'ali', 'adham'],
    'age': [25, 22, None, 30],
    'country': ['Egy', 'Egy', 'KSA', None]
}
df_na = pd.DataFrame(data_na)
df_na

Unnamed: 0,name,age,country
0,ahmed,25.0,Egy
1,mahmoud,22.0,Egy
2,ali,,KSA
3,adham,30.0,


In [35]:
df_na.isnull()

Unnamed: 0,name,age,country
0,False,False,False
1,False,False,False
2,False,True,False
3,False,False,True


In [36]:
df_na.isna().sum()

Unnamed: 0,0
name,0
age,1
country,1


In [37]:
df_na.isna().sum().sum()

2

In [38]:
filled_df = df_na.fillna('unknown')

In [39]:
filled_df

Unnamed: 0,name,age,country
0,ahmed,25.0,Egy
1,mahmoud,22.0,Egy
2,ali,unknown,KSA
3,adham,30.0,unknown


In [40]:
clean_df = df_na.dropna(axis = 0)

In [41]:
clean_df

Unnamed: 0,name,age,country
0,ahmed,25.0,Egy
1,mahmoud,22.0,Egy


In [45]:
def b_calc(x):
  return x + 200

clean_df['Salary'] = [51551, 48905]
clean_df['Salary_b'] = clean_df['Salary'].apply(b_calc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['Salary'] = [51551, 48905]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['Salary_b'] = clean_df['Salary'].apply(b_calc)


In [46]:
clean_df

Unnamed: 0,name,age,country,Salary,Salary_b
0,ahmed,25.0,Egy,51551,51751
1,mahmoud,22.0,Egy,48905,49105


## Read from CSV

In [48]:
df = pd.read_csv('/content/Grades_Short.csv')
df

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,ID
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,90743
1,Joe,32.0,1,20.0,16,1,14.0,32.0,A,7284
2,Susan,30.0,1,19.0,19,1,10.5,33.0,A-,7625
3,Sol,31.0,1,22.0,13,1,13.0,34.0,A,1237
4,Chris,30.0,1,19.0,17,1,12.5,33.5,A,62
5,Tarik,31.0,1,19.0,19,1,8.0,24.0,B,87452
6,Malik,31.5,1,20.0,21,1,9.0,36.0,A,9374


In [50]:
df.head(2)

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,ID
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,90743
1,Joe,32.0,1,20.0,16,1,14.0,32.0,A,7284


In [51]:
df.shape

(7, 10)

In [52]:
df.dtypes

Unnamed: 0,0
Name,object
Previous_Part,float64
Participation1,int64
Mini_Exam1,float64
Mini_Exam2,int64
Participation2,int64
Mini_Exam3,float64
Final,float64
Grade,object
ID,int64


In [53]:
df.columns

Index(['Name', 'Previous_Part', 'Participation1', 'Mini_Exam1', 'Mini_Exam2',
       'Participation2', 'Mini_Exam3', 'Final', 'Grade', 'ID'],
      dtype='object')

In [54]:
df.index

RangeIndex(start=0, stop=7, step=1)

In [56]:
name_series = df['Name']
name_series

Unnamed: 0,Name
0,Jake
1,Joe
2,Susan
3,Sol
4,Chris
5,Tarik
6,Malik


In [58]:
df.loc[0:4, 'Name':'Mini_Exam1']

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1
0,Jake,32.0,1,19.5
1,Joe,32.0,1,20.0
2,Susan,30.0,1,19.0
3,Sol,31.0,1,22.0
4,Chris,30.0,1,19.0


In [60]:
df.iloc[0:5, 0:4]

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1
0,Jake,32.0,1,19.5
1,Joe,32.0,1,20.0
2,Susan,30.0,1,19.0
3,Sol,31.0,1,22.0
4,Chris,30.0,1,19.0


## Built-in Functions

In [61]:
df['Final'].mean()

32.214285714285715

In [62]:
df['Final'].max()

36.0

In [63]:
df['Final'].min()

24.0

In [64]:
df.describe()

Unnamed: 0,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,ID
count,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
mean,31.071429,1.0,19.785714,17.857143,1.0,11.0,32.214286,29111.0
std,0.838082,0.0,1.074598,2.734262,0.0,2.217356,3.828154,41131.08167
min,30.0,1.0,19.0,13.0,1.0,8.0,24.0,62.0
25%,30.5,1.0,19.0,16.5,1.0,9.5,32.5,4260.5
50%,31.0,1.0,19.5,19.0,1.0,10.5,33.0,7625.0
75%,31.75,1.0,20.0,19.5,1.0,12.75,33.75,48413.0
max,32.0,1.0,22.0,21.0,1.0,14.0,36.0,90743.0


In [66]:
df['Grade'].value_counts()

Unnamed: 0_level_0,count
Grade,Unnamed: 1_level_1
A,5
A-,1
B,1


In [69]:
df['Participation1'].value_counts()

Unnamed: 0_level_0,count
Participation1,Unnamed: 1_level_1
1,7


In [70]:
df['Grade'].unique()

array(['A', 'A-', 'B'], dtype=object)

## Create new column

In [71]:
df['final_perc'] = df['Final'] / 36
df

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,ID,final_perc
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,90743,0.916667
1,Joe,32.0,1,20.0,16,1,14.0,32.0,A,7284,0.888889
2,Susan,30.0,1,19.0,19,1,10.5,33.0,A-,7625,0.916667
3,Sol,31.0,1,22.0,13,1,13.0,34.0,A,1237,0.944444
4,Chris,30.0,1,19.0,17,1,12.5,33.5,A,62,0.930556
5,Tarik,31.0,1,19.0,19,1,8.0,24.0,B,87452,0.666667
6,Malik,31.5,1,20.0,21,1,9.0,36.0,A,9374,1.0


## Deleting Coln

In [72]:
del df['final_perc']
df

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,ID
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,90743
1,Joe,32.0,1,20.0,16,1,14.0,32.0,A,7284
2,Susan,30.0,1,19.0,19,1,10.5,33.0,A-,7625
3,Sol,31.0,1,22.0,13,1,13.0,34.0,A,1237
4,Chris,30.0,1,19.0,17,1,12.5,33.5,A,62
5,Tarik,31.0,1,19.0,19,1,8.0,24.0,B,87452
6,Malik,31.5,1,20.0,21,1,9.0,36.0,A,9374


In [73]:
df.drop('ID', axis = 1, inplace = True)
df

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A
1,Joe,32.0,1,20.0,16,1,14.0,32.0,A
2,Susan,30.0,1,19.0,19,1,10.5,33.0,A-
3,Sol,31.0,1,22.0,13,1,13.0,34.0,A
4,Chris,30.0,1,19.0,17,1,12.5,33.5,A
5,Tarik,31.0,1,19.0,19,1,8.0,24.0,B
6,Malik,31.5,1,20.0,21,1,9.0,36.0,A


## DF with Missing values

In [76]:
missied = pd.read_csv('/content/Missing_Data.csv')
missied

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1,19.5,20,1,10.0,33.0,A,-1
1,Joe,,1,20.0,16,1,14.0,32.0,A,23
2,Sol,31.0,1,22.0,13,1,13.0,34.0,A,34
3,Chris,30.0,-1,19.0,not available,1,12.5,33.5,A,72


In [77]:
all_missied = pd.read_csv('/content/Missing_Data.csv',
                          na_values=['NaN', "not available"])
all_missied

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,,1,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,-1,19.0,,1,12.5,33.5,A,72


In [78]:
specfied_missied = pd.read_csv('/content/Missing_Data.csv',
                          na_values={
                              'Mini_Exam2': 'not available',
                              'Participation1': -1,
                          })
specfied_missied

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,,1.0,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,,19.0,,1,12.5,33.5,A,72


In [81]:
specfied_missied.isnull()

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,True,False,False,False,False,False


In [82]:
specfied_missied.isnull().sum()

Unnamed: 0,0
Name,0
Previous_Part,1
Participation1,1
Mini_Exam1,0
Mini_Exam2,1
Participation2,0
Mini_Exam3,0
Final,0
Grade,0
Temp,0


In [84]:
specfied_missied.isnull().sum().sum()

3

In [86]:
cleaned_df = specfied_missied.dropna(axis=0)
cleaned_df

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34


In [87]:
filled_df = specfied_missied.fillna(0)
filled_df

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1
1,Joe,0.0,1.0,20.0,16.0,1,14.0,32.0,A,23
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34
3,Chris,30.0,0.0,19.0,0.0,1,12.5,33.5,A,72


In [89]:
def apply_func(x):
  return x*2

filled_df['double_Final'] = filled_df['Final'].apply(apply_func)
filled_df

Unnamed: 0,Name,Previous_Part,Participation1,Mini_Exam1,Mini_Exam2,Participation2,Mini_Exam3,Final,Grade,Temp,double_Final
0,Jake,32.0,1.0,19.5,20.0,1,10.0,33.0,A,-1,66.0
1,Joe,0.0,1.0,20.0,16.0,1,14.0,32.0,A,23,64.0
2,Sol,31.0,1.0,22.0,13.0,1,13.0,34.0,A,34,68.0
3,Chris,30.0,0.0,19.0,0.0,1,12.5,33.5,A,72,67.0
