<a href="https://colab.research.google.com/github/ManaggZZ/Pandas_self/blob/main/pandas_self_code_notebook/jupyter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### SERIES

In [2]:
import pandas as pd

# Initializing a Series from a list
data = [1, 2, 3, 4, 5]
series_from_list = pd.Series(data)
print(series_from_list)

# Initializing a Series from a dictionary
data = {'a': 1, 'b': 2, 'c': 3}
series_from_dict = pd.Series(data)
print(series_from_dict)

# Initializing a Series with custom index
data = [1, 2, 3, 4, 5]
new_index = ['a', 'b', 'c', 'd', 'e']
series_custom_index = pd.Series(data, index=new_index)
print(series_custom_index)


0    1
1    2
2    3
3    4
4    5
dtype: int64
a    1
b    2
c    3
dtype: int64
a    1
b    2
c    3
d    4
e    5
dtype: int64


In [3]:
# Basic operations
series_a = pd.Series([1, 2, 3])
series_b = pd.Series([4, 5, 6])
sum_series = series_a + series_b
print(sum_series)


0    5
1    7
2    9
dtype: int64


In [4]:
# Index Alignment
series_a = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
series_b = pd.Series([4, 5, 6], index=['b', 'c', 'd'])
sum_series = series_a + series_b
print(sum_series)


a    NaN
b    6.0
c    8.0
d    NaN
dtype: float64


### DataFrame

In [5]:
# Initializing a DataFrame from a dictionary
data = {'Name': ['John', 'Alice', 'Bob'],
        'Age': [25, 30, 35],
        'City': ['New York', 'Los Angeles', 'Chicago']}
df = pd.DataFrame(data)
print(df)

print()
# Initializing a DataFrame from a list of lists
data = [['John', 25, 'New York'],
        ['Alice', 30, 'Los Angeles'],
        ['Bob', 35, 'Chicago']]
new_columns = ['Name', 'Age', 'City']
df = pd.DataFrame(data, columns=new_columns)
print(df)

    Name  Age         City
0   John   25     New York
1  Alice   30  Los Angeles
2    Bob   35      Chicago

    Name  Age         City
0   John   25     New York
1  Alice   30  Los Angeles
2    Bob   35      Chicago


In [6]:
# Using Series in 2D data
data = {'Name': ['John', 'Alice', 'Bob'],
        'Age': [25, 30, 35],
        'City': ['New York', 'Los Angeles', 'Chicago']}
df_series = pd.Series(data)
print(df_series)

Name                  [John, Alice, Bob]
Age                         [25, 30, 35]
City    [New York, Los Angeles, Chicago]
dtype: object


In [7]:
# INDEXING
# Accessing a column
print(df['Name'])

print()

# Accessing an individual element
print(df.at[0, 'Name'])


0     John
1    Alice
2      Bob
Name: Name, dtype: object

John


In [8]:
# Accessing a row by label
print(df.loc[0])
print()
print(df.loc[[0,1]])
print()
print(df.loc[(df.Name == "John")])

print()
# Accessing a row by integer position
print(df.iloc[0])

Name        John
Age           25
City    New York
Name: 0, dtype: object

    Name  Age         City
0   John   25     New York
1  Alice   30  Los Angeles

   Name  Age      City
0  John   25  New York

Name        John
Age           25
City    New York
Name: 0, dtype: object


In [9]:
# loc includes the last element of range
print(df.loc[0:2])

# iloc does not include the last element of range
print(df.iloc[0:2])

    Name  Age         City
0   John   25     New York
1  Alice   30  Los Angeles
2    Bob   35      Chicago
    Name  Age         City
0   John   25     New York
1  Alice   30  Los Angeles


In [10]:
data2 = {'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 35]}
df2 = pd.DataFrame(data, index=['a', 'b', 'c'])
# print(df2.iloc['a'])
print(df2.iloc[0])
print(df2.loc['a'])

Name        John
Age           25
City    New York
Name: a, dtype: object
Name        John
Age           25
City    New York
Name: a, dtype: object


In [11]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   City    3 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes
None


In [12]:
df.describe()

Unnamed: 0,Age
count,3.0
mean,30.0
std,5.0
min,25.0
25%,27.5
50%,30.0
75%,32.5
max,35.0


In [14]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [16]:
df_dropped = df.drop(1)
print(df_dropped)

   Name  Age      City
0  John   25  New York
2   Bob   35   Chicago


In [17]:
df_dropped_col = df.drop('Age', axis=1)
print(df_dropped_col)

    Name         City
0   John     New York
1  Alice  Los Angeles
2    Bob      Chicago


### DATA ALIGNMENT

In [26]:
# Index Alignment
series_a = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
series_b = pd.Series([4, 5, 6], index=['b', 'c', 'd'])
sum_series = series_a + series_b
print(sum_series)


a    NaN
b    6.0
c    8.0
d    NaN
dtype: float64


In [21]:
# Filling Missing Data
sum_series = series_a.add(series_b, fill_value=0)
print(sum_series)

a    1.0
b    6.0
c    8.0
d    6.0
dtype: float64


In [25]:
sum_series.fillna(0, inplace = True)
print(sum_series)

a    0.0
b    6.0
c    8.0
d    0.0
dtype: float64


In [27]:
aligned_s1, aligned_s2 = series_a.align(series_b, fill_value=0)
print(aligned_s1)
print(aligned_s2)

a    1.0
b    2.0
c    3.0
d    0.0
dtype: float64
a    0.0
b    4.0
c    5.0
d    6.0
dtype: float64


### SORTING

In [28]:
new_df = pd.DataFrame({'A': [3, 1, 2]}, index=['b', 'c', 'a'])
sorted_df = new_df.sort_index()
print(sorted_df)

   A
a  2
b  3
c  1


In [29]:
sorted_df = new_df.sort_values(by='A')
print(sorted_df)

   A
c  1
a  2
b  3


In [30]:
sorted_df = new_df.sort_values(by='A', ascending=False)
print(sorted_df)

   A
b  3
a  2
c  1


### **Ranking**

 **Syntax**:
Series.rank(axis=0, method=’average’, numeric_only=None, na_option=’keep’, ascending=True, pct=False)


**Parameter** :

**axis** :  index to direct ranking

**method** : {‘average’, ‘min’, ‘max’, ‘first’, ‘dense’}

**numeric_only** :  Include only float, int, boolean data. Valid only for DataFrame or Panel objects

**na_option** :  {‘keep’, ‘top’, ‘bottom’}

**ascending** :  False for ranks by high (1) to low (N)

In [31]:
s = pd.Series([7, 1, 2, 7])
ranked = s.rank()
print(ranked)

0    3.5
1    1.0
2    2.0
3    3.5
dtype: float64


In [34]:
sr = pd.Series([10, 25, 3, 11, 24, 6])
# Create the Index
index_ = ['Coca Cola', 'Sprite', 'Coke', 'Fanta', 'Dew', 'ThumbsUp']
# set the index
sr.index = index_
# Print the series
print(sr)
print()

# assign rank
result = sr.rank()
# Print the result
print(result)

Coca Cola    10
Sprite       25
Coke          3
Fanta        11
Dew          24
ThumbsUp      6
dtype: int64

Coca Cola    3.0
Sprite       6.0
Coke         1.0
Fanta        4.0
Dew          5.0
ThumbsUp     2.0
dtype: float64


In [36]:
ranked_max = s.rank(method='max')
print(ranked_max)

0    4.0
1    1.0
2    2.0
3    4.0
dtype: float64


In [38]:
ranked_min = s.rank(method='min')
print(ranked_min)

0    3.0
1    1.0
2    2.0
3    3.0
dtype: float64


In [42]:
ranked_first = s.rank(method='first')
print(ranked_first)

0    3.0
1    1.0
2    2.0
3    4.0
dtype: float64


### MultiIndexing

In [48]:
index = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 1), ('B', 2)], names=['Letter', 'Number'])
df = pd.DataFrame({'Data': [10, 20, 30, 40]}, index=index)
print(df)

               Data
Letter Number      
A      1         10
       2         20
B      1         30
       2         40


In [50]:
index = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 1), ('B', 2)], names=['Letter', 'Number'])
Data = [10, 20, 30, 40]
df = pd.DataFrame(Data, index=index, columns=['Value'])
print(df)

               Value
Letter Number       
A      1          10
       2          20
B      1          30
       2          40


### GroupBy

In [51]:
# Creating pandas dataframe
df = pd.DataFrame(
    [
        ("Corona Positive", 65, 99),
        ("Corona Negative", 52, 98.7),
        ("Corona Positive", 43, 100.1),
        ("Corona Positive", 26, 99.6),
        ("Corona Negative", 30, 98.1),
    ],

    index=["Patient 1", "Patient 2", "Patient 3",
           "Patient 4", "Patient 5"],

    columns=("Status", "Age(in Years)", "Temperature"),
)

# show dataframe
print(df)

                    Status  Age(in Years)  Temperature
Patient 1  Corona Positive             65         99.0
Patient 2  Corona Negative             52         98.7
Patient 3  Corona Positive             43        100.1
Patient 4  Corona Positive             26         99.6
Patient 5  Corona Negative             30         98.1


In [56]:
# Grouping with only status
grouped1 = df.groupby("Status")

# Grouping with temperature and status
grouped3 = df.groupby(["Temperature", "Status"])

# Finding the mean of the patients reports according to the status
grouped1.mean()

Unnamed: 0_level_0,Age(in Years),Temperature
Status,Unnamed: 1_level_1,Unnamed: 2_level_1
Corona Negative,41.0,98.4
Corona Positive,44.666667,99.566667


In [57]:
# Finding the mean of patients reports according to temperature & status
grouped3.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age(in Years)
Temperature,Status,Unnamed: 2_level_1
98.1,Corona Negative,30.0
98.7,Corona Negative,52.0
99.0,Corona Positive,65.0
99.6,Corona Positive,26.0
100.1,Corona Positive,43.0
