In [23]:
# Example 1: Loading a CSV file into a DataFrame

import pandas as pd

# Load a CSV file into a DataFrame
df = pd.read_csv('ait_pg_2024/datasets/cars.csv')

# Display the first few rows of the DataFrame
print(df.head())
# In this example, we use the read_csv function from Pandas to load a CSV file named data.csv into a DataFrame.

    mpg  cylinders  displacement  horsepower  weight  acceleration  year  \
0  18.0          8         307.0         130    3504          12.0    70   
1  15.0          8         350.0         165    3693          11.5    70   
2  18.0          8         318.0         150    3436          11.0    70   
3  16.0          8         304.0         150    3433          12.0    70   
4  17.0          8         302.0         140    3449          10.5    70   

   origin                       name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  


In [2]:
# Example 2: Exploring DataFrame Information

# Display the column names and data types
print("Column Names and Data Types:")
print(df.dtypes)

# Display summary statistics
print("\nSummary Statistics:")
print(df.describe())

# Display the shape of the DataFrame
print("\nDataFrame Shape:", df.shape)
# Here, we explore information about the DataFrame, such as column names, data types, summary statistics, and the shape of the DataFrame.


Column Names and Data Types:
mpg             float64
cylinders         int64
displacement    float64
horsepower        int64
weight            int64
acceleration    float64
year              int64
origin            int64
name             object
dtype: object

Summary Statistics:
              mpg   cylinders  displacement  horsepower       weight  \
count  392.000000  392.000000    392.000000  392.000000   392.000000   
mean    23.445918    5.471939    194.411990  104.469388  2977.584184   
std      7.805007    1.705783    104.644004   38.491160   849.402560   
min      9.000000    3.000000     68.000000   46.000000  1613.000000   
25%     17.000000    4.000000    105.000000   75.000000  2225.250000   
50%     22.750000    4.000000    151.000000   93.500000  2803.500000   
75%     29.000000    8.000000    275.750000  126.000000  3614.750000   
max     46.600000    8.000000    455.000000  230.000000  5140.000000   

       acceleration        year      origin  
count    392.000000  392.

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    int64  
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   year          392 non-null    int64  
 7   origin        392 non-null    int64  
 8   name          392 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 27.7+ KB


In [3]:
# Example 3: Selecting and Filtering Data

# Select a single column
column_data = df['Column_Name']

# Select multiple columns
multiple_columns = df[['Column1', 'Column2']]

# Filter rows based on a condition
filtered_data = df[df['Column_Name'] > 10]
# In this example, we demonstrate how to select specific columns and filter rows based on conditions in the DataFrame.


KeyError: 'Column_Name'

In [6]:
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin', 'name'],
      dtype='object')

In [9]:
# Example 3: Selecting and Filtering Data

# Select a single column
column_data = df['mpg']

# Select multiple columns
multiple_columns = df[['mpg', 'cylinders']]

# Filter rows based on a condition
filtered_data = df[df['mpg'] > 20]
# In this example, we demonstrate how to select specific columns and filter rows based on conditions in the DataFrame.

In [10]:
filtered_data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
14,24.0,4,113.0,95,2372,15.0,70,3,toyota corona mark ii
15,22.0,6,198.0,95,2833,15.5,70,1,plymouth duster
17,21.0,6,200.0,85,2587,16.0,70,1,ford maverick
18,27.0,4,97.0,88,2130,14.5,70,3,datsun pl510
19,26.0,4,97.0,46,1835,20.5,70,2,volkswagen 1131 deluxe sedan
...,...,...,...,...,...,...,...,...,...
387,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
388,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
389,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
390,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [24]:
# Example 4: Grouping and Aggregating Data

# Group data by a column and calculate mean
mean_by_group = df.groupby('mpg').mean()

# Perform multiple aggregations
agg_results = df.groupby('name').agg({'displacement': 'mean', 'mpg': 'sum'})
# Here, we group the data by a column and perform aggregations such as mean calculation and summing across multiple columns.

TypeError: agg function failed [how->mean,dtype->object]

In [37]:
length = len(df)

df['target'] = ['a' if i < length / 3 else 'b' if i < 2 * length / 3 else 'c' for i in range(length)]

In [40]:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

In [42]:
def calculate_mean(col, target):
    mean_val = col.mean()
    return mean_val

In [44]:
from joblib import Parallel, delayed
results = Parallel(n_jobs=-1)(delayed(calculate_mean)(df[col], df['target']) for col in numeric_cols if col != 'target')

In [45]:
results

[23.445918367346938,
 5.471938775510204,
 194.41198979591837,
 104.46938775510205,
 2977.5841836734694,
 15.541326530612244,
 75.9795918367347,
 1.5765306122448979,
 2.2346938775510203]

In [46]:
mean_dict = dict(zip(numeric_cols, results))

In [47]:
mean_dict

{'mpg': 23.445918367346938,
 'cylinders': 5.471938775510204,
 'displacement': 194.41198979591837,
 'horsepower': 104.46938775510205,
 'weight': 2977.5841836734694,
 'acceleration': 15.541326530612244,
 'year': 75.9795918367347,
 'origin': 1.5765306122448979,
 'test': 2.2346938775510203}