In [None]:
# Exploratory Data Analysis (EDA)

# This notebook performs Exploratory Data Analysis (EDA) on the TellCo Customer Analysis dataset. The goal is to understand the data, identify patterns, and prepare it for further analysis and feature engineering.

## 1. Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy import stats
# Define file paths
# All three inputs sit in the same directory, so that directory is named once
# here; point DATA_DIR somewhere else to run the notebook on another machine.
DATA_DIR = 'C:/Users/hayyu.ragea/AppData/Local/Programs/Python/Python312/TellCo_Customer_Analysis/data'
excel_file_path = f'{DATA_DIR}/Field Descriptions.xlsx'
xlsx_data_path = f'{DATA_DIR}/Week1_challenge_data_source.xlsx'
csv_data_path = f'{DATA_DIR}/Week1_challenge_data_source(CSV).csv'

# Load data
excel_data = pd.read_excel(excel_file_path)
xlsx_data = pd.read_excel(xlsx_data_path)
csv_data = pd.read_csv(csv_data_path)

# Display first few rows of each dataset
excel_data.head(), xlsx_data.head(), csv_data.head()
# Merge datasets if necessary
# For example, if you want to merge CSV and XLSX data on a common column
# merged_data = pd.merge(csv_data, xlsx_data, on='common_column')

# Clean and preprocess data
def clean_data(df):
    """Return ``df`` without duplicate rows and without incomplete rows.

    Exact duplicate rows are removed first; afterwards every row that holds
    at least one missing value is discarded. The frame passed in is not
    modified, and the surviving rows keep their original index labels.
    """
    return df.drop_duplicates().dropna()

# Run the cleaning step on one dataset (the CSV export) as an example
cleaned_data = csv_data.pipe(clean_data)
# Summary statistics for every column, numeric or not
cleaned_data.describe(include='all')
# Plot distribution of numerical variables
numerical_cols = cleaned_data.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    # One histogram with a KDE overlay per numeric column, each on its own figure.
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.histplot(cleaned_data[col], kde=True, ax=ax)
    ax.set(title=f'Distribution of {col}', xlabel=col, ylabel='Frequency')
    plt.show()
# Check for missing values
# Counting on `cleaned_data` is uninformative: clean_data() has already dropped
# every row that contains a missing value, so those counts are always zero.
# Report the counts of the raw CSV data instead, which is what the cleaning
# step actually had to deal with.
missing_values = csv_data.isnull().sum()
print("Missing values per column:")
print(missing_values)

# Sanity check: nothing missing should be left once cleaning has run.
assert not cleaned_data.isnull().any().any(), "cleaned_data still contains missing values"

# Detect outliers using z-scores (flagged rows are returned, not removed)
def detect_outliers(df, column, threshold=3):
    """Return the rows of ``df`` whose value in ``column`` is a z-score outlier.

    Args:
        df: DataFrame to scan.
        column: Name of a numeric column of ``df``.
        threshold: Absolute z-score above which a row is flagged.
            Defaults to 3, a common rule of thumb.

    Returns:
        The subset of ``df`` (original index labels kept) whose absolute
        z-score in ``column`` is greater than ``threshold``. Rows whose
        value is missing are never flagged.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    # Work on a plain float array so missing values are uniformly NaN.
    values = df[column].to_numpy(dtype=float, na_value=np.nan)
    # nan_policy='omit' computes mean/std from the non-missing values only.
    # With the default policy a single NaN makes every z-score NaN, and the
    # function would then silently report "no outliers".
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    # NaN > threshold is False, so missing entries drop out of the mask.
    return df[z_scores > threshold]

# 'some_numerical_column' is a template placeholder. When the dataset has no
# column of that name, fall back to the first numeric column so the cell does
# not fail with a KeyError; replace the placeholder with the column of interest.
outlier_column = 'some_numerical_column'
if outlier_column not in cleaned_data.columns:
    numeric_candidates = cleaned_data.select_dtypes(include='number').columns
    if len(numeric_candidates) > 0:
        outlier_column = numeric_candidates[0]

outliers = detect_outliers(cleaned_data, outlier_column)
# Interpolate the column actually analysed so the heading matches the output.
print(f"Outliers in '{outlier_column}':")
print(outliers)
# Correlation matrix
# DataFrame.corr() does not skip non-numeric columns by itself any more
# (pandas >= 2.0 raises when it meets one), so restrict the frame to numeric
# columns explicitly before correlating.
correlation_matrix = cleaned_data.select_dtypes(include='number').corr()

# Plot correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()
# Standardize features before PCA
# PCA is scale-sensitive, so every numeric column is brought to zero mean and
# unit variance first.
features = cleaned_data.select_dtypes(include=['float64', 'int64'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_features)

# Create DataFrame for PCA results
# Carry over the row labels of `features`: after cleaning the index is no
# longer 0..n-1, and a default RangeIndex would misalign the components with
# `cleaned_data` as soon as the two frames are joined or compared.
pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'], index=features.index)

# Plot PCA result
# Scatter of the two principal components on a dedicated figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=pca_df, x='PC1', y='PC2', ax=ax)
ax.set(
    title='PCA Result',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
)
plt.show()
