# Pandas Basics

In [1]:
import pandas as pd


# Pandas is a powerful Python library for data analysis and manipulation.
# It provides two main data structures:

# Series → one-dimensional data (like a column).

# DataFrame → two-dimensional data (like a table).

# With Pandas, you can:

# Load and save data (CSV, Excel, SQL, JSON).

# Clean and organize datasets.

# Summarize and analyze data quickly.

# Group, merge, and filter information easily.

# Simply: Pandas = Excel + Python, but faster and smarter

In [6]:
# Column-oriented records: each key names a column and each list holds that
# column's values, one entry per student (all lists here have six entries).
data = {
    "Name": ["Ali", "Sara", "Omar", "Lina", "Youssef", "Mohamed"],
    "Age": [23, 21, 25, 22, 24, 21],
    "Country": ["Egypt", "Jordan", "Egypt", "Lebanon", "Morocco", "Egypt"],
    "Score": [85, 90, 78, 88, 92, 89],
}

# A DataFrame is two-dimensional (rows x columns):
#   dict keys   -> column labels (Name, Age, Country, Score)
#   dict values -> the contents of each column
df = pd.DataFrame(data=data)
print(df)

      Name  Age  Country  Score
0      Ali   23    Egypt     85
1     Sara   21   Jordan     90
2     Omar   25    Egypt     78
3     Lina   22  Lebanon     88
4  Youssef   24  Morocco     92
5  Mohamed   21    Egypt     89


In [25]:
from operator import index
studentname = ["Ali", "Sara", "Omar", "Lina", "Youssef", "Mohamed"]
# One list of four numbers per student, in the same order as `studentname`.
Age = [
    [23, 45, 56, 45],
    [21, 34, 213, 4],
    [25, 45, 23, 11],
    [22, 9, 534, 3],
    [22, 9, 534, 3],
    [22, 9, 534, 3],
]

# A Series is one-dimensional: it has just two parts, the index (the labels
# used for lookup) and the data stored under each label.
x = pd.Series(Age, index=studentname)
print(x)

Ali        [23, 45, 56, 45]
Sara       [21, 34, 213, 4]
Omar       [25, 45, 23, 11]
Lina        [22, 9, 534, 3]
Youssef     [22, 9, 534, 3]
Mohamed     [22, 9, 534, 3]
dtype: object


In [27]:
# Label-based lookups. Only the value of a cell's final expression is
# displayed, so the "Mohamed" lookup produces no visible output.
x.loc["Mohamed"]
x.loc["Omar"]

# Lookups go through the index, which holds the student names. To find a
# student from a number instead, the numbers would have to be the index;
# as it stands, 25 is not one of the index labels, so x[25] is not a valid
# label lookup.

[25, 45, 23, 11]

In [28]:
# Building a Series straight from the dict: each key becomes an index label
# and the whole list under that key is stored as a single element.
y = pd.Series(data=data)
print(y)

Name             [Ali, Sara, Omar, Lina, Youssef, Mohamed]
Age                               [23, 21, 25, 22, 24, 21]
Country    [Egypt, Jordan, Egypt, Lebanon, Morocco, Egypt]
Score                             [85, 90, 78, 88, 92, 89]
dtype: object


In [45]:
# Load the student performance dataset from a CSV file (relative path) into
# a DataFrame and print it.
d = pd.read_csv("StudentsPerformance.csv")
print(d)

     gender race/ethnicity parental level of education         lunch  \
0    female        group B           bachelor's degree      standard   
1    female        group C                some college      standard   
2    female        group B             master's degree      standard   
3      male        group A          associate's degree  free/reduced   
4      male        group C                some college      standard   
..      ...            ...                         ...           ...   
995  female        group E             master's degree      standard   
996    male        group C                 high school  free/reduced   
997  female        group C                 high school  free/reduced   
998  female        group D                some college      standard   
999  female        group D                some college  free/reduced   

    test preparation course  math score  reading score  writing score  
0                      none          72             72         

In [50]:
# Peek at both ends of the dataset, six rows each. Only the final expression
# of a cell is rendered, so just the tail appears in the output.
d.head(n=6)
d.tail(n=6)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
994,male,group A,high school,standard,none,63,63,62
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


In [51]:
# Print a structural summary of d: index range, column names, non-null
# counts, dtypes and memory usage.
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [52]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# score columns.
d.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [56]:
# Select the row whose index label is 2 and print it as a Series
# (column name -> value).
print(d.loc[2])

gender                                  female
race/ethnicity                         group B
parental level of education    master's degree
lunch                                 standard
test preparation course                   none
math score                                  90
reading score                               95
writing score                               93
Name: 2, dtype: object


# Clean data

In [63]:
d.isna().sum()

# d.isna() marks every cell as True (missing, NaN) or False (has a value),
# giving a DataFrame of booleans.
# .sum() then adds up the True values in each column, so the final result is
# the number of missing values per column, not the True / False table itself.
# A count of 0 means the column has no missing values.

Unnamed: 0,0
gender,0
race/ethnicity,0
parental level of education,0
lunch,0
test preparation course,0
math score,0
reading score,0
writing score,0


In [64]:
d.duplicated().sum()

# d.duplicated() flags each row:
#   True  -> the row repeats an earlier row.
#   False -> the row is unique so far (first occurrence).
# .sum() counts the True flags, i.e. how many duplicate rows d contains.

np.int64(0)

In [76]:
# Replace missing values in the numeric columns with that column's mean.
# d.mean has to be *called*: passing the bare method (d.mean) hands fillna
# the method object itself as the fill value instead of the computed means.
# numeric_only=True restricts the means to the numeric score columns, since
# the text columns (gender, lunch, ...) have no mean.
# The result is assigned back to d rather than filled in place, so the cell
# reads as an explicit "new d from old d" step.
d = d.fillna(d.mean(numeric_only=True))
d

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [81]:
# Drop every row that contains at least one missing value.
# Assigning the result back to d (instead of inplace=True) keeps the step
# explicit and chainable; d still ends up holding the cleaned frame.
d = d.dropna()
d

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [82]:
# Remove rows that are exact repeats of an earlier row, keeping the first
# occurrence. Assigning the result back to d (instead of inplace=True) keeps
# the step explicit and chainable; d still ends up holding the cleaned frame.
d = d.drop_duplicates()
d

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


# Apply Function

In [106]:

import pandas as pd

# Reload the dataset from disk; read_csv returns a DataFrame.
d = pd.read_csv("StudentsPerformance.csv")
type(d)  # not the cell's last expression, so its value is not displayed

# Boolean mask: keep only the rows whose math score is 70 or lower.
d.loc[d["math score"] <= 70]


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
1,female,group C,some college,standard,completed,69,90,88
3,male,group A,associate's degree,free/reduced,none,47,57,44
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50
...,...,...,...,...,...,...,...,...
993,female,group D,bachelor's degree,free/reduced,none,62,72,74
994,male,group A,high school,standard,none,63,63,62
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65


In [110]:
def math_5(marks, bonus=5):
    """Return ``marks`` raised by ``bonus`` points (5 by default).

    Args:
        marks: The score to adjust. Anything that supports ``+`` with a
            number works, e.g. a single score passed in by ``Series.apply``.
        bonus: Number of points to add. Defaults to 5, so existing
            one-argument calls such as ``Series.apply(math_5)`` behave
            exactly as before.

    Returns:
        ``marks + bonus``. The result is not capped at any maximum score.
    """
    return marks + bonus

In [111]:
# Call math_5 on every math score and show the results as a new Series.
# Nothing is assigned, so d itself is left unchanged.
d["math score"].apply(math_5)

Unnamed: 0,math score
0,77
1,74
2,95
3,52
4,81
...,...
995,93
996,67
997,64
998,73


In [113]:
# Order the rows from the highest math score to the lowest. The sorted frame
# is only displayed; it is not assigned, so d keeps its original order.
d.sort_values(by="math score", ascending=False)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
451,female,group E,some college,standard,none,100,92,97
458,female,group E,bachelor's degree,standard,none,100,100,100
962,female,group E,associate's degree,standard,none,100,100,100
149,male,group E,associate's degree,free/reduced,completed,100,100,93
623,male,group A,some college,standard,completed,100,96,86
...,...,...,...,...,...,...,...,...
145,female,group C,some college,free/reduced,none,22,39,33
787,female,group B,some college,standard,none,19,38,32
17,female,group B,some high school,free/reduced,none,18,32,28
980,female,group B,high school,free/reduced,none,8,24,23


In [115]:
# Distinct math scores as a NumPy array, listed in order of first appearance.
d["math score"].unique()

array([ 72,  69,  90,  47,  76,  71,  88,  40,  64,  38,  58,  65,  78,
        50,  18,  46,  54,  66,  44,  74,  73,  67,  70,  62,  63,  56,
        97,  81,  75,  57,  55,  53,  59,  82,  77,  33,  52,   0,  79,
        39,  45,  60,  61,  41,  49,  30,  80,  42,  27,  43,  68,  85,
        98,  87,  51,  99,  84,  91,  83,  89,  22, 100,  96,  94,  48,
        35,  34,  86,  92,  37,  28,  24,  26,  95,  36,  29,  32,  93,
        19,  23,   8])