In [358]:
import numpy as np
import pandas as pd

# Series
you can convert list, numpy, or dictionary to a Series

In [359]:
#create Series
ser1 = pd.Series([1,2,3,4],index = ['USA', 'Germany','USSR', 'Japan'])

In [360]:
#Show first three rows
ser1.head(3)

USA        1
Germany    2
USSR       3
dtype: int64

# DataFrames

DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

In [361]:
X=np.random.randn(5,4)
X

array([[ 1.80431323,  1.22003466,  0.0880195 , -0.88689207],
       [ 1.11247458, -0.63661567, -0.42948593,  0.88937145],
       [ 0.90242464,  0.67142873,  0.03743395, -0.72553944],
       [-0.07453702,  0.14917675,  0.66342766, -1.35832228],
       [-0.72420332,  1.48396519, -0.35968557,  0.31808379]])

In [362]:
index=['A','B','C','D','E']
index

['A', 'B', 'C', 'D', 'E']

In [363]:
columns=['W','X','Y','Z']
columns

['W', 'X', 'Y', 'Z']

In [364]:
#create DataFrame
df = pd.DataFrame(data=X,index=index,columns=columns)

df.head()

Unnamed: 0,W,X,Y,Z
A,1.804313,1.220035,0.08802,-0.886892
B,1.112475,-0.636616,-0.429486,0.889371
C,0.902425,0.671429,0.037434,-0.725539
D,-0.074537,0.149177,0.663428,-1.358322
E,-0.724203,1.483965,-0.359686,0.318084


# Selection and Indexing

Let's learn the various methods to grab data from a DataFrame

In [365]:
df['W']

A    1.804313
B    1.112475
C    0.902425
D   -0.074537
E   -0.724203
Name: W, dtype: float64

In [366]:
# Pass a list of column names
df[['W','Z']]

Unnamed: 0,W,Z
A,1.804313,-0.886892
B,1.112475,0.889371
C,0.902425,-0.725539
D,-0.074537,-1.358322
E,-0.724203,0.318084


### Creating a new column:

In [367]:
df['new'] = df['W'] + df['Y']

df.head()

Unnamed: 0,W,X,Y,Z,new
A,1.804313,1.220035,0.08802,-0.886892,1.892333
B,1.112475,-0.636616,-0.429486,0.889371,0.682989
C,0.902425,0.671429,0.037434,-0.725539,0.939859
D,-0.074537,0.149177,0.663428,-1.358322,0.588891
E,-0.724203,1.483965,-0.359686,0.318084,-1.083889


### Removing Columns

In [368]:
df.drop('new',axis=1)

Unnamed: 0,W,X,Y,Z
A,1.804313,1.220035,0.08802,-0.886892
B,1.112475,-0.636616,-0.429486,0.889371
C,0.902425,0.671429,0.037434,-0.725539
D,-0.074537,0.149177,0.663428,-1.358322
E,-0.724203,1.483965,-0.359686,0.318084


In [369]:
# Not inplace unless specified!
df

Unnamed: 0,W,X,Y,Z,new
A,1.804313,1.220035,0.08802,-0.886892,1.892333
B,1.112475,-0.636616,-0.429486,0.889371,0.682989
C,0.902425,0.671429,0.037434,-0.725539,0.939859
D,-0.074537,0.149177,0.663428,-1.358322,0.588891
E,-0.724203,1.483965,-0.359686,0.318084,-1.083889


In [370]:
df.drop('new',axis=1,inplace=True)

In [371]:
df

Unnamed: 0,W,X,Y,Z
A,1.804313,1.220035,0.08802,-0.886892
B,1.112475,-0.636616,-0.429486,0.889371
C,0.902425,0.671429,0.037434,-0.725539
D,-0.074537,0.149177,0.663428,-1.358322
E,-0.724203,1.483965,-0.359686,0.318084


In [372]:
X= df.iloc[:,:-1]
y= df.iloc[:,-1]

print(X)

          W         X         Y
A  1.804313  1.220035  0.088020
B  1.112475 -0.636616 -0.429486
C  0.902425  0.671429  0.037434
D -0.074537  0.149177  0.663428
E -0.724203  1.483965 -0.359686


In [373]:
print(y)

A   -0.886892
B    0.889371
C   -0.725539
D   -1.358322
E    0.318084
Name: Z, dtype: float64


Can also drop rows this way:

In [374]:
df.drop('E',axis=0, inplace=True)

### Selecting Rows

In [375]:
#Row of A
df.loc['A']

W    1.804313
X    1.220035
Y    0.088020
Z   -0.886892
Name: A, dtype: float64

##### Or select based off of position instead of label

In [376]:
#Row of index two (الصف الثالث)
df.iloc[2]

W    0.902425
X    0.671429
Y    0.037434
Z   -0.725539
Name: C, dtype: float64

In [377]:
df.iloc[:2,:1]

Unnamed: 0,W
A,1.804313
B,1.112475


### Selecting subset of rows and columns

In [378]:
print(df.loc['B','Y'])

-0.42948592854981044


In [379]:
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,1.804313,0.08802
B,1.112475,-0.429486


# Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [380]:
df

Unnamed: 0,W,X,Y,Z
A,1.804313,1.220035,0.08802,-0.886892
B,1.112475,-0.636616,-0.429486,0.889371
C,0.902425,0.671429,0.037434,-0.725539
D,-0.074537,0.149177,0.663428,-1.358322


In [381]:
df>0

Unnamed: 0,W,X,Y,Z
A,True,True,True,False
B,True,False,False,True
C,True,True,True,False
D,False,True,True,False


In [382]:
df[df>0]

Unnamed: 0,W,X,Y,Z
A,1.804313,1.220035,0.08802,
B,1.112475,,,0.889371
C,0.902425,0.671429,0.037434,
D,,0.149177,0.663428,


In [383]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,1.804313,1.220035,0.08802,-0.886892
B,1.112475,-0.636616,-0.429486,0.889371
C,0.902425,0.671429,0.037434,-0.725539


In [384]:
df[df['W']>0]['Y']

A    0.088020
B   -0.429486
C    0.037434
Name: Y, dtype: float64

### For two conditions you can use | and & with parenthesis:

In [385]:
df[(df['W']>0) & (df['Y'] < 1)]

Unnamed: 0,W,X,Y,Z
A,1.804313,1.220035,0.08802,-0.886892
B,1.112475,-0.636616,-0.429486,0.889371
C,0.902425,0.671429,0.037434,-0.725539


# More Index Details

Let's discuss some more features of indexing, including resetting the index or setting it something else. We'll also talk about index hierarchy!

In [386]:
newind = ['CA', 'NY', 'WY', 'OR']
newind

['CA', 'NY', 'WY', 'OR']

In [387]:
df['States'] = newind
df

Unnamed: 0,W,X,Y,Z,States
A,1.804313,1.220035,0.08802,-0.886892,CA
B,1.112475,-0.636616,-0.429486,0.889371,NY
C,0.902425,0.671429,0.037434,-0.725539,WY
D,-0.074537,0.149177,0.663428,-1.358322,OR


In [388]:
#Set index
df.set_index('States',inplace=True)
df

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1.804313,1.220035,0.08802,-0.886892
NY,1.112475,-0.636616,-0.429486,0.889371
WY,0.902425,0.671429,0.037434,-0.725539
OR,-0.074537,0.149177,0.663428,-1.358322


# Missing Data

Let's show a few convenient methods to deal with Missing Data in pandas:

In [389]:
df = pd.DataFrame({'A':[1,2,np.nan],
                  'B':[5,np.nan,np.nan],
                  'C':[1,2,3]})

df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [390]:
df.isna()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False


In [391]:
df.isna().sum()

A    1
B    2
C    0
dtype: int64

In [392]:
df.dropna(inplace=True)

In [393]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [394]:
df.dropna() #inplace=True

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [395]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [396]:
df.reset_index(drop=True,inplace=True)

In [397]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [398]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [399]:
df.dropna(axis=1)

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [400]:
df.fillna(value='FILL VALUE')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [401]:
df['A'].fillna(value=df['A'].mean())

0    1.0
Name: A, dtype: float64

# duplicated

Let's show a few convenient methods to deal with duplicated Data in pandas:

In [402]:
import pandas as pd
df = pd.DataFrame({'A':[1,1,15,1],
                  'B':[2,2,11,2],
                  'C':[3,3,7,2],
                  'C':[3,3,7,3]})

df

Unnamed: 0,A,B,C
0,1,2,3
1,1,2,3
2,15,11,7
3,1,2,3


In [403]:
df.duplicated()

0    False
1     True
2    False
3     True
dtype: bool

In [404]:
df.duplicated().sum()

2

In [405]:
df.drop_duplicates()

Unnamed: 0,A,B,C
0,1,2,3
2,15,11,7


# Operations

There are lots of operations with pandas that will be really useful to you, but don't fall into any distinct category. Let's show them here in this lecture:

In [406]:
import pandas as pd
df = pd.DataFrame({'col1':[1,2,3,4],
                   'col2':[444,555,666,444],
                   'col3':['abc','def','ghi','xyz']})
df.head()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


### Info on Unique Values

In [407]:
df['col2'].unique()

array([444, 555, 666], dtype=int64)

In [408]:
df['col2'].nunique()

3

In [409]:
df['col2'].value_counts()

444    2
555    1
666    1
Name: col2, dtype: int64

In [410]:
df['col2'].value_counts().sort_values()

555    1
666    1
444    2
Name: col2, dtype: int64

# Applying Functions

In [411]:
def times2(x):
    return x*2

In [412]:
df['col1'].apply(times2)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

In [413]:
df['col1'].apply(lambda x:x*2)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

### Get column and index names:

In [414]:
df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [415]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [416]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col1    4 non-null      int64 
 1   col2    4 non-null      int64 
 2   col3    4 non-null      object
dtypes: int64(2), object(1)
memory usage: 224.0+ bytes


In [417]:
df.describe()

Unnamed: 0,col1,col2
count,4.0,4.0
mean,2.5,527.25
std,1.290994,106.274409
min,1.0,444.0
25%,1.75,444.0
50%,2.5,499.5
75%,3.25,582.75
max,4.0,666.0


### Groupby

The groupby method allows you to group rows of data together and call aggregate functions

In [418]:
import pandas as pd

# Create the DataFrame
data = {'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
        'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
        'Sales': [200, 120, 340, 124, 243, 350]}

df = pd.DataFrame(data)

print(df)

# Group by 'Company' and find the maximum sales value
company_sales_max = df.loc[df.groupby('Company')['Sales'].idxmax()]
print(company_sales_max)

  Company   Person  Sales
0    GOOG      Sam    200
1    GOOG  Charlie    120
2    MSFT      Amy    340
3    MSFT  Vanessa    124
4      FB     Carl    243
5      FB    Sarah    350
  Company Person  Sales
5      FB  Sarah    350
0    GOOG    Sam    200
2    MSFT    Amy    340


In [419]:
# Group by 'Company' and find the maximum sales value
company_sales_max = df.loc[df.groupby('Company')['Sales'].idxmin()]
print(company_sales_max)

  Company   Person  Sales
4      FB     Carl    243
1    GOOG  Charlie    120
3    MSFT  Vanessa    124


In [420]:
df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


#### Example 1: Group by Company and Calculate Sum of Sales
You can use the groupby() function along with an aggregation function like sum() to group the data by company and calculate the total sales for each company.

In [421]:
# Group by 'Company' and sum the sales
company_sales_sum = df.groupby('Company').sum()
print(company_sales_sum)


         Sales
Company       
FB         593
GOOG       320
MSFT       464


#### Example 2: Group by Company and Calculate Mean of Sales
You can also calculate the average sales for each company.

In [422]:
# Group by 'Company' and calculate mean sales
company_sales_mean = df.groupby('Company')['Sales'].mean()
print(company_sales_mean)


Company
FB      296.5
GOOG    160.0
MSFT    232.0
Name: Sales, dtype: float64


#### Example 3: Group by Company and Get Count of Entries
If you want to count how many entries (i.e., how many people) are there for each company, you can use the count() function.

In [423]:
# Group by 'Company' and count the number of entries
company_sales_count = df.groupby('Company').count()
print(company_sales_count)


         Person  Sales
Company               
FB            2      2
GOOG          2      2
MSFT          2      2


#### Example 4: Group by Company and Get Maximum Sales
You can also use the max() function to get the maximum sales for each company.

In [424]:
# Group by 'Company' and find the maximum sales value
company_sales_max = df.groupby('Company')['Sales'].max()
print(company_sales_max)


Company
FB      350
GOOG    200
MSFT    340
Name: Sales, dtype: int64


#### Example 5: Group by Company and Apply Multiple Aggregation Functions
You can apply multiple aggregation functions at once using the agg() function.

In [425]:
# Group by 'Company' and apply multiple aggregation functions only to the 'Sales' column
company_sales_agg = df.groupby('Company')['Sales'].agg(['sum', 'mean', 'max'])
print(company_sales_agg)


         sum   mean  max
Company                 
FB       593  296.5  350
GOOG     320  160.0  200
MSFT     464  232.0  340


In [426]:
df.groupby('Company').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


In [427]:
df.groupby('Company').describe().transpose()

Unnamed: 0,Company,FB,GOOG,MSFT
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0


In [428]:
df.groupby('Company').describe().transpose()['GOOG']

Sales  count      2.000000
       mean     160.000000
       std       56.568542
       min      120.000000
       25%      140.000000
       50%      160.000000
       75%      180.000000
       max      200.000000
Name: GOOG, dtype: float64

#### Concatenation

Concatenation basically glues together DataFrames. Keep in mind that dimensions should match along the axis you are concatenating on. You can use **pd.concat** and pass in a list of DataFrames to concatenate together:

In [429]:
import pandas as pd

In [430]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])

In [431]:
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4, 5, 6, 7])

In [432]:
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8, 9, 10, 11])

In [433]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [434]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [435]:
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [436]:
pd.concat([df1,df2,df3])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [437]:
w = pd.concat([df1,df2,df3],axis=1)

In [438]:
w

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,A0,B0,C0,D0,,,,,,,,
1,A1,B1,C1,D1,,,,,,,,
2,A2,B2,C2,D2,,,,,,,,
3,A3,B3,C3,D3,,,,,,,,
4,,,,,A4,B4,C4,D4,,,,
5,,,,,A5,B5,C5,D5,,,,
6,,,,,A6,B6,C6,D6,,,,
7,,,,,A7,B7,C7,D7,,,,
8,,,,,,,,,A8,B8,C8,D8
9,,,,,,,,,A9,B9,C9,D9


In [439]:
w['A']

Unnamed: 0,A,A.1,A.2
0,A0,,
1,A1,,
2,A2,,
3,A3,,
4,,A4,
5,,A5,
6,,A6,
7,,A7,
8,,,A8
9,,,A9


## Select data type

In [440]:
import pandas as pd

# Sample DataFrame with different data types
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'age': [24, 27, 22, 32],
    'height': [5.5, 6.0, 5.8, 5.7],
    'city': ['New York', 'Los Angeles', 'Chicago', 'Houston'],
    'is_student': [True, False, True, False]
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,name,age,height,city,is_student
0,Alice,24,5.5,New York,True
1,Bob,27,6.0,Los Angeles,False
2,Charlie,22,5.8,Chicago,True
3,David,32,5.7,Houston,False


In [441]:
# 1. Selecting only numeric columns
numeric_df = df.select_dtypes(include='number')
print("Numeric columns:\n", numeric_df)


Numeric columns:
    age  height
0   24     5.5
1   27     6.0
2   22     5.8
3   32     5.7


In [442]:
# 2. Selecting only object (string) columns
object_df = df.select_dtypes(include='object')
print("\nObject columns:\n", object_df)


Object columns:
       name         city
0    Alice     New York
1      Bob  Los Angeles
2  Charlie      Chicago
3    David      Houston


In [443]:
# 3. Excluding boolean columns
no_bool_df = df.select_dtypes(exclude='bool')
print("\nDataFrame without boolean columns:\n", no_bool_df)


DataFrame without boolean columns:
       name  age  height         city
0    Alice   24     5.5     New York
1      Bob   27     6.0  Los Angeles
2  Charlie   22     5.8      Chicago
3    David   32     5.7      Houston


In [444]:
# 4. Selecting only numeric columns
numeric_object_df = df.select_dtypes(include=['number','object'])
print("Numeric object columns:\n", numeric_object_df)


Numeric object columns:
       name  age  height         city
0    Alice   24     5.5     New York
1      Bob   27     6.0  Los Angeles
2  Charlie   22     5.8      Chicago
3    David   32     5.7      Houston


## Change type column

In [445]:
# Sample DataFrame
data = {'age': [24.0, 27.0, 22.0, 32.0]}
df = pd.DataFrame(data)

# Convert 'age' column to integer type
df['age'] = df['age'].astype(int)
print(df.dtypes)


age    int32
dtype: object


In [446]:
df['age'] = df['age'].astype(str)
print(df.dtypes)


age    object
dtype: object


##  Replace Function

In [447]:
# Sample DataFrame
data = {'name': ['Alice', 'Bob', 'Charlie', 'David'],
        'city': ['New York', 'Los Angeles', 'Chicago', 'Houston']}
df = pd.DataFrame(data)

# Replace 'New York' with 'NYC' in the 'city' column
df['city'] = df['city'].replace('New York', 'NYC')
print(df)


      name         city
0    Alice          NYC
1      Bob  Los Angeles
2  Charlie      Chicago
3    David      Houston


In [448]:
# Replace 'New York' with 'NYC' and 'Los Angeles' with 'LA'
df['city'] = df['city'].replace({'New York': 'NYC', 'Los Angeles': 'LA'})
print(df)


      name     city
0    Alice      NYC
1      Bob       LA
2  Charlie  Chicago
3    David  Houston


In [449]:
# Replace both 'New York' and 'Los Angeles' with 'Unknown'
df['city'] = df['city'].replace(['New York', 'Los Angeles'], 'Unknown')
print(df)


      name     city
0    Alice      NYC
1      Bob       LA
2  Charlie  Chicago
3    David  Houston


In [450]:
# Replace 'Alice' with 'Alicia' and 'David' with 'Dave' across the DataFrame
df = df.replace({'Alice': 'Alicia', 'David': 'Dave'})
print(df)


      name     city
0   Alicia      NYC
1      Bob       LA
2  Charlie  Chicago
3     Dave  Houston


In [451]:
# Replace any name starting with 'A' with 'Starts with A'
df['name'] = df['name'].replace(r'^A.*', 'Starts with A', regex=True)
print(df)


            name     city
0  Starts with A      NYC
1            Bob       LA
2        Charlie  Chicago
3           Dave  Houston


## Encoding

In [452]:
# Sample data
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Gender': ['Female', 'Male', 'Male'],
    'City': ['New York', 'Los Angeles', 'Chicago']
}

# Create DataFrame
df = pd.DataFrame(data)

# Convert categorical columns to dummy variables
df_dummies = pd.get_dummies(df, columns=['Gender', 'City'])

df_dummies


Unnamed: 0,Name,Gender_Female,Gender_Male,City_Chicago,City_Los Angeles,City_New York
0,Alice,1,0,0,0,1
1,Bob,0,1,0,1,0
2,Charlie,0,1,1,0,0


# Read and Save data

This notebook is the reference code for getting Read and Save, pandas can read a variety of file types using its pd.read_ methods. Let's take a look at the most common data types:

In [453]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,rank,state,state_code,2020_census,percent_of_total
0,1,California,CA,39538223.0,0.1191
1,2,Texas,TX,29145505.0,0.0874
2,3,Florida,FL,21538187.0,0.0647
3,4,,NY,,0.0586
4,5,Pennsylvania,PA,13002700.0,0.0386


In [454]:
df = pd.read_csv('data.csv',sep=';')  #delimiter but dose n`t have default value
df.head()

Unnamed: 0,"rank,state,state_code,2020_census,percent_of_total"
0,"1,California,CA,39538223,0.1191"
1,"2,Texas,TX,29145505,0.0874"
2,"3,Florida,FL,21538187,0.0647"
3,"4,,NY,,0.0586"
4,"5,Pennsylvania,PA,13002700,0.0386"


In [455]:
df = pd.read_csv('data.csv',na_values='CA')
df.head()

Unnamed: 0,rank,state,state_code,2020_census,percent_of_total
0,1,California,,39538223.0,0.1191
1,2,Texas,TX,29145505.0,0.0874
2,3,Florida,FL,21538187.0,0.0647
3,4,,NY,,0.0586
4,5,Pennsylvania,PA,13002700.0,0.0386


In [456]:
df = pd.read_csv('data.csv',index_col=['rank'])
df.head()

Unnamed: 0_level_0,state,state_code,2020_census,percent_of_total
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,California,CA,39538223.0,0.1191
2,Texas,TX,29145505.0,0.0874
3,Florida,FL,21538187.0,0.0647
4,,NY,,0.0586
5,Pennsylvania,PA,13002700.0,0.0386


In [486]:
df = pd.read_csv('data.csv',header=None)
df.head()

Unnamed: 0,0,1,2,3,4
0,rank,state,state_code,2020_census,percent_of_total
1,1,California,CA,39538223,0.1191
2,2,Texas,TX,29145505,0.0874
3,3,Florida,FL,21538187,0.0647
4,4,,NY,,0.0586


In [None]:
#The header=0 argument in pd.read_csv() tells pandas that the first row of the CSV file (index 0) contains the column names.
df = pd.read_csv('data.csv',header=0,names=['rank','St','St_co','2020_cen','Per_o_to'])
df.head()

Unnamed: 0,rank,St,St_co,2020_cen,Per_o_to
0,1,California,CA,39538223.0,0.1191
1,2,Texas,TX,29145505.0,0.0874
2,3,Florida,FL,21538187.0,0.0647
3,4,,NY,,0.0586
4,5,Pennsylvania,PA,13002700.0,0.0386


In [488]:
df = pd.read_csv('data.csv',nrows=3)
df.head()

Unnamed: 0,rank,state,state_code,2020_census,percent_of_total
0,1,California,CA,39538223,0.1191
1,2,Texas,TX,29145505,0.0874
2,3,Florida,FL,21538187,0.0647


In [489]:
df = pd.read_csv('data.csv',skiprows=3)
df.head()

Unnamed: 0,3,Florida,FL,21538187,0.0647
0,4,,NY,,0.0586
1,5,Pennsylvania,PA,13002700.0,0.0386
2,6,,IL,,0.0382
3,7,Ohio,OH,11799448.0,
4,8,Georgia,GA,10711908.0,0.032


# Saving Csv

In [2]:
import pandas as pd
df = pd.read_csv('data.csv')
df.to_csv('output.csv',index=False)