Data Gathering

In [None]:
# pandas supplies every reader used in this cell.
import pandas as pd

# The file names, `query` and `connection` below are placeholders:
# substitute your own path, SQL statement and open database connection.

# From CSV: keep the result -- `df` is the frame the following cells work on.
df = pd.read_csv('file.csv')

# From Excel:
df_excel = pd.read_excel('file.xlsx')

# From SQL Database:
df_sql = pd.read_sql(query, connection)

# Web scraping
#  e.g. with BeautifulSoup (not shown here)

Basic Information

In [None]:
# Plotting libraries, imported where they are first needed.
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# A notebook cell only renders its *last* bare expression, so every
# intermediate result is handed to display() explicitly; otherwise
# head(), tail(), shape, ... would be computed and silently thrown away.

# Display the first few rows of the DataFrame
display(df.head())

# Display the last few rows of the DataFrame
display(df.tail())

# Display the shape of the DataFrame (rows, columns)
display(df.shape)

# Display column names and data types (info() prints its own report)
df.info()

# Summary statistics for numerical columns
display(df.describe())

# Unique values in a column
display(df['column'].unique())

# Number of unique values in a column
display(df['column'].nunique())

# Correlation matrix.
# numeric_only=True restricts the computation to numeric columns; without
# it, recent pandas versions raise as soon as the frame has a text column.
display(df.corr(numeric_only=True))

# Distribution of numerical variables
sns.pairplot(df)
plt.show()


Missing Values

In [None]:
# Check for missing values in DataFrame: number of missing entries per column.
# isna() and isnull() are aliases, so one call is enough; the original
# `df.isna() / df.isnull().sum()` performed an actual element-wise division
# of the boolean mask by the counts instead of offering two alternatives.
df.isna().sum()


Handle missing values

In [None]:
# The strategies below are alternatives -- in a real analysis pick the one
# that fits each column rather than running all of them in sequence.
#
# Results are always assigned back. Calling fillna()/interpolate() with
# inplace=True on a column selection (df['col'].fillna(..., inplace=True))
# acts on an intermediate object and, under pandas Copy-on-Write, leaves
# `df` itself unchanged.

# Drop rows with missing values (returns a new frame, so keep the result)
df_no_missing = df.dropna()

# Fill missing values with a specified value
# (`value` is a placeholder for the fill value of your choice)
df_filled = df.fillna(value)

# Drop rows with missing values in specific columns
df = df.dropna(subset=['numerical_column'])
df = df.dropna(subset=['categorical_column'])  # categorical

# Fill missing values with mean
mean_value = df['numerical_column'].mean()
df['numerical_column'] = df['numerical_column'].fillna(mean_value)

# Fill missing values with median
median_value = df['numerical_column'].median()
df['numerical_column'] = df['numerical_column'].fillna(median_value)

# Fill missing values with mode
# mode() returns a Series because there can be several modes; [0] takes the
# first one. NOTE: this lookup fails if the column holds no values at all.
mode_value = df['numerical_column'].mode()[0]
df['numerical_column'] = df['numerical_column'].fillna(mode_value)

mode_value = df['categorical_column'].mode()[0]  # categorical
df['categorical_column'] = df['categorical_column'].fillna(mode_value)

# Fill missing values with interpolation
df['numerical_column'] = df['numerical_column'].interpolate(method='linear')

# Fill missing values with a placeholder--categorical
df['categorical_column'] = df['categorical_column'].fillna('Unknown')


Outlier Detection

In [None]:
# numpy is needed for np.abs below.
import numpy as np
from scipy.stats import zscore

# Calculate z-score for a numerical column.
# nan_policy='omit' ignores missing values when computing mean and standard
# deviation; with the default ('propagate') a single NaN turns every z-score
# into NaN and no outlier would ever be flagged.
z_scores = zscore(df['numerical_column'], nan_policy='omit')

# Detect outliers using z-score (more than 3 standard deviations from the mean).
# Rows whose value is missing have a NaN z-score and are not flagged.
outliers = df[np.abs(z_scores) > 3]

# Visualize outliers using a boxplot (missing values are dropped first,
# because the box statistics cannot be computed from data containing NaN)
plt.boxplot(df['numerical_column'].dropna())
plt.show()

Outlier Handling

In [None]:
# This cell reuses `z_scores` computed in the outlier-detection cell.

# Drop rows with outliers.
# Rows with a NaN z-score fail the comparison and are dropped as well.
df_clean = df[np.abs(z_scores) <= 3]

# Log transformation to reduce skewness
# (only defined for strictly positive values; zeros give -inf, negatives NaN)
df['log_column'] = np.log(df['numerical_column'])

# Square root transformation (negative values give NaN)
df['sqrt_column'] = np.sqrt(df['numerical_column'])

# Box-Cox transformation (the second return value is the fitted lambda)
from scipy.stats import boxcox
df['boxcox_column'], _ = boxcox(df['numerical_column'])

# Set a threshold for capping: 3 standard deviations around the mean.
# The bounds must be centred on the mean -- comparing the raw values with
# +/- 3*std alone is only right for data that already has mean 0.
column_mean = df['numerical_column'].mean()
threshold = 3 * df['numerical_column'].std()
lower_bound = column_mean - threshold
upper_bound = column_mean + threshold

# Cap outliers above and below the threshold in one step. Two separate
# np.where() calls that both read the original column would let the second
# assignment discard the cap applied by the first.
df['capped_column'] = df['numerical_column'].clip(lower=lower_bound, upper=upper_bound)

# Apply winsorization to cap outliers (lowest and highest 5% of the values)
from scipy.stats.mstats import winsorize
df['winsorized_column'] = winsorize(df['numerical_column'], limits=[0.05, 0.05])

# Fill outliers using linear interpolation.
# interpolate() only fills missing values, so the outliers are blanked out
# (set to NaN) first and then interpolated from their neighbours.
is_outlier = np.abs(z_scores) > 3
df['interpolated_column'] = df['numerical_column'].mask(is_outlier).interpolate(method='linear')

Feature Engineering

In [None]:
# Create new features (e.g., age groups)
# include_lowest=True puts age 0 into the first bin, and the open-ended last
# edge keeps ages above 100 in 'Senior'; with the plain [0, 18, 35, 60, 100]
# edges both cases would silently become NaN.
age_bins = [0, 18, 35, 60, float('inf')]
age_labels = ['Child', 'Young Adult', 'Adult', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, include_lowest=True)

# Extract information from datetime variables (parse the column only once)
parsed_dates = pd.to_datetime(df['date'])
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month


Distribution Analysis

In [None]:
# Each plot gets its own axes; issuing the three plotting calls back to back
# would draw them on top of each other on the same axes.
fig, (ax_hist, ax_box, ax_dist) = plt.subplots(1, 3, figsize=(15, 4))

# Missing values are removed once up front: the histogram and the box plot
# cannot be computed from data that contains NaN.
column_values = df['column'].dropna()

# Histogram for a numerical column
ax_hist.hist(column_values, bins=10)
ax_hist.set(title='Histogram', xlabel='column', ylabel='count')

# Box plot for a numerical column
ax_box.boxplot(column_values)
ax_box.set(title='Box plot', ylabel='column')

# Distribution plot for a numerical column.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(column_values, kde=True, stat='density', ax=ax_dist)
ax_dist.set(title='Distribution', xlabel='column')

fig.tight_layout()
plt.show()


Relationship Analysis

In [None]:
# Scatter plot between two numerical columns
fig, ax = plt.subplots()
ax.scatter(df['column1'], df['column2'])
ax.set(title='column1 vs column2', xlabel='column1', ylabel='column2')
plt.show()

# Pairplot for multiple numerical columns (creates its own figure)
sns.pairplot(df[['column1', 'column2', 'column3']])
plt.show()

# Correlation matrix, computed once and reused for the heatmap.
# numeric_only=True skips non-numeric columns, which recent pandas versions
# would otherwise refuse to correlate.
correlation = df.corr(numeric_only=True)

# Heatmap for correlation matrix, on a fresh figure so it is not drawn onto
# the axes left over from the pairplot
fig, ax = plt.subplots()
sns.heatmap(correlation, annot=True, ax=ax)
ax.set(title='Correlation matrix')
plt.show()

Categorical Variables

In [None]:
# Count plot for a categorical column.
# The column is passed by keyword: recent seaborn versions interpret the
# first positional argument as `data`, not as the x variable.
fig, ax = plt.subplots()
sns.countplot(x='category', data=df, ax=ax)
ax.set(title='Count per category')
plt.show()

# Box plot for a numerical column by category (separate figure, so it does
# not overlay the count plot)
fig, ax = plt.subplots()
sns.boxplot(x='category', y='numerical_column', data=df, ax=ax)
ax.set(title='numerical_column by category')
plt.show()

# Cross-tabulation between two categorical columns
pd.crosstab(df['category1'], df['category2'])

 Time Series Analysis

In [None]:
# Convert string to datetime format
df['datetime_column'] = pd.to_datetime(df['datetime_column'])

# Line plot for a time series data.
# The rows are sorted chronologically first; plotting unsorted timestamps
# makes the line jump back and forth across the chart.
time_sorted = df.sort_values('datetime_column')
fig, ax = plt.subplots()
ax.plot(time_sorted['datetime_column'], time_sorted['value'])
ax.set(title='value over time', xlabel='datetime_column', ylabel='value')
plt.show()

# Resample time series data to daily totals.
# resample() needs datetime information: `on=` points it at the datetime
# column, because a frame with a default integer index raises a TypeError.
# numeric_only=True keeps non-numeric columns out of the sum.
df.resample('D', on='datetime_column').sum(numeric_only=True)


Encoding Categorical Variables -- Label Encoding: Converts categorical labels into integer codes (0, 1, 2, ...) using the LabelEncoder from scikit-learn. The codes follow the sorted (alphabetical) order of the labels, so they only reflect an inherent order if that order happens to match the sort order; for ordinal variables with an explicit ranking, use OrdinalEncoder with its `categories` argument (or an ordered pandas Categorical) instead.
One-Hot Encoding: Creates binary columns for each category and assigns 1 if the category is present, 0 otherwise. Implemented using pd.get_dummies() in pandas. Suitable for nominal categorical variables without an inherent order.

In [None]:
# Label encoding.
# LabelEncoder numbers the classes in sorted (alphabetical) order, so the
# resulting integers do NOT encode a meaningful ranking unless the sort order
# happens to match it. For truly ordinal variables supply the order
# explicitly (e.g. OrdinalEncoder(categories=[...]) or an ordered pandas
# Categorical).
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['encoded_category'] = label_encoder.fit_transform(df['category_column'])

# One-hot encoding (for nominal categorical variables).
# drop_first=True drops one level per variable to avoid redundant columns;
# dtype=int yields 1/0 indicator columns (recent pandas versions default to
# True/False).
encoded_df = pd.get_dummies(
    df,
    columns=['nominal_category_column'],
    drop_first=True,
    dtype=int,
)


Scaling and Normalization of Numerical Features---Min-Max Scaling: Scales numerical features to a specified range (e.g., [0, 1]) using MinMaxScaler from scikit-learn.
Standardization: Scales numerical features to have mean=0 and variance=1 using StandardScaler from scikit-learn.

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Columns to scale, named once so both scalers act on the same set.
numeric_columns = ['numeric_column_1', 'numeric_column_2']

# Min-Max scaling (scaling features to a range, [0, 1] by default).
# The result goes into a copy: writing it back into `df` and then
# standardizing the same columns would simply overwrite it.
minmax_scaler = MinMaxScaler()
df_minmax = df.copy()
df_minmax[numeric_columns] = minmax_scaler.fit_transform(df[numeric_columns])

# Standardization (scaling features to have mean=0 and variance=1).
# Applied to `df` itself under the name `scaler`, which matches the state
# this cell has always left behind.
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
