# code for screening and cleaning data
## box plots and corr matrix also found individually below

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Screen a CSV dataset (structure, missing values, duplicates, distributions,
# correlations), then clean it and write the result to 'cleaned_data.csv'.

# Load your dataset into a pandas DataFrame
data = pd.read_csv('your_data.csv')

# Data Screening

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None".
data.info()

# Display the first few rows of the dataset
print(data.head())

# Check for missing values (number of nulls per column)
missing_values = data.isnull().sum()
print(missing_values)

# Check for duplicate rows
duplicates = data.duplicated().sum()
print("Number of duplicate rows:", duplicates)

# Summary statistics
summary_stats = data.describe()
print(summary_stats)

# Visualize distribution of numeric columns using histograms
data.hist(figsize=(10, 8))
plt.show()

# Visualize correlation between numeric variables using a heatmap.
# Only numeric columns are passed to corr(): since pandas 2.0, corr() no longer
# drops non-numeric columns on its own and raises if it meets a text column.
correlation_matrix = data.select_dtypes(include=np.number).corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

# Data Cleaning

# Remove duplicate rows
data.drop_duplicates(inplace=True)

# Impute missing values with median for numeric columns
numeric_columns = data.select_dtypes(include=np.number).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Handle outliers using IQR method: keep rows within 1.5 * IQR of the quartiles.
# The filter is applied one column at a time, so the quartiles of later columns
# are computed on the rows that survived the earlier columns.
for column in numeric_columns:
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    # between() is inclusive on both ends, i.e. lower <= value <= upper
    data = data[data[column].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]


def _normalize_text(value):
    """Return *value* stripped and lower-cased if it is a str, else unchanged."""
    return value.strip().lower() if isinstance(value, str) else value


# Convert categorical variables to lowercase and remove leading/trailing spaces.
# Series.map is applied column by column instead of DataFrame.applymap, which
# is deprecated since pandas 2.1.
data = data.apply(lambda col: col.map(_normalize_text))

# Save the cleaned data
data.to_csv('cleaned_data.csv', index=False)



# pair plot using seaborn

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset (swap 'wine.csv' for the path to your own file)
data = pd.read_csv('wine.csv')

# Draw the grid of pairwise plots for the loaded frame
sns.pairplot(data=data)

# Render the figure
plt.show()




# box and whisker plot

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset (swap 'wine.csv' for the path to your own file)
data = pd.read_csv('wine.csv')

# Open a 12 x 8 inch figure; enlarge further if the boxes look cramped
plt.figure(figsize=(12, 8))

# Draw vertical boxes from the loaded frame; the boxes are drawn
# semi-transparent (alpha=0.7)
box_style = {'alpha': 0.7}
sns.boxplot(
    data=data,
    orient='v',
    width=0.5,
    linewidth=1.5,
    boxprops=box_style,
)

# Title and axis captions in a larger font
plt.title('Box and Whisker Plot of Individual Variables', fontsize=16)
plt.xlabel('Variables', fontsize=14)
plt.ylabel('Values', fontsize=14)

# Tilt the x tick labels so long column names do not overlap
plt.xticks(rotation=45, ha='right')

# Tighten the layout, then render the figure
plt.tight_layout()
plt.show()


In [None]:
# Draw one box plot per column, each in its own subplot.
# sharex=False and sharey=False give every subplot its own independent axes.
# (Passing True instead would make all subplots share one x-axis / y-axis,
# which makes it easier to compare the data across the subplots.)
# layout=(-1, 6) places six subplots per row and lets pandas work out how many
# rows are needed, instead of always allocating a fixed 10 x 6 grid.

data.plot(kind='box', subplots=True, layout=(-1, 6), sharex=False, sharey=False)
plt.show()

# histograms

In [None]:
# Simple histogram of each individual variable.
# This cell does not load anything itself: it plots whatever DataFrame an
# earlier cell has bound to `data`.
data.hist()
# Display the plot
plt.show()

## this is the same as the first pair plot using seaborn, it renders much faster but is not as neat or easy to read

In [None]:
# scatter plot matrix
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt


# Settings for the scatter matrix: alpha controls the transparency of the
# points, and diagonal='hist' puts a histogram on the diagonal
matrix_options = {'alpha': 0.5, 'figsize': (10, 10), 'diagonal': 'hist'}
scatter_matrix(data, **matrix_options)

# Render the figure
plt.show()

# violin plot
## good for comparing shape,spread, and central tendency of distrib. across multiple categories. good at handling unever data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset (swap 'abalone.csv' for the path to your own file)
data = pd.read_csv('abalone.csv')

# Open a 10 x 6 inch figure and draw the violins on it
violin_figure = plt.figure(figsize=(10, 6))
violin_axes = sns.violinplot(data=data)

# Title and placeholder axis captions (replace with meaningful text)
violin_axes.set_title('Violin Plot')
violin_axes.set_xlabel('X-axis Label')
violin_axes.set_ylabel('Y-axis Label')

# Tighten the layout, then render the figure
violin_figure.tight_layout()
plt.show()


# correlation matrix

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read data from CSV file (replace 'wine.csv' with your actual file path)
data = pd.read_csv('wine.csv')

# Calculate the correlation matrix from the numeric columns only.
# Since pandas 2.0, corr() no longer drops non-numeric columns on its own and
# raises if the frame contains a text column, so they are filtered out first.
correlation_matrix = data.select_dtypes(include='number').corr()

# Create a heatmap of the correlation matrix; center=0 puts the midpoint of the
# diverging 'coolwarm' palette at zero correlation
plt.figure(figsize=(10, 8))  # Adjust the figure size if needed
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)

# Customize plot labels and appearance
plt.title('Correlation Matrix Heatmap')
plt.xticks(rotation=45)
plt.yticks(rotation=0)

# Display the plot
plt.tight_layout()
plt.show()


# pie chart

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset (swap 'wine.csv' for the path to your own file)
data = pd.read_csv('wine.csv')

# Column whose distinct values become the pie slices (replace with your own).
# NOTE(review): value_counts() yields one slice per distinct value, so this
# reads best for a column with few distinct values -- confirm 'Alcohol' is one.
category_column = 'Alcohol'
grouped_data = data[category_column].value_counts()

# Draw the pie on an 8 x 8 inch figure, labelling each slice with its value
# and its percentage share
plt.figure(figsize=(8, 8))
plt.pie(
    grouped_data,
    labels=grouped_data.index,
    autopct='%1.1f%%',
    startangle=140,
)

# Add the title
plt.title('Pie Chart of Categories')

# Equal aspect ratio keeps the pie circular; then render the figure
plt.axis('equal')
plt.show()
