In [None]:
import pandas as pd

# Partial exports of the electrical-permit data, listed in part order
csv_files = [
    'Electrical_permits_part_1.csv',
    'Electrical_permits_part_2.csv',
    'Electrical_permits_part_3.csv',
    'Electrical_permits_part_4.csv',
    'Electrical_permits_part_5.csv',
    'Electrical_permits_part_6.csv',
    'Electrical_permits_part_7.csv',
    'Electrical_permits_part_8.csv'
]

# Load each part into its own DataFrame, then stack them under a fresh 0..n-1 index
df_list = list(map(pd.read_csv, csv_files))
merged_df = pd.concat(objs=df_list, ignore_index=True)

# Persist the combined table; index=False keeps the row index out of the file
merged_df.to_csv('Electrical_permits.csv', index=False)

# Preview the first rows of the combined table
merged_preview = merged_df.head()
print(merged_preview)


In [None]:
import pandas as pd

# Reload the merged permit table written by the concatenation step
df = pd.read_csv(filepath_or_buffer='Electrical_permits.csv')

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

# Impute missing 'EstProjectCost' values with a linear regression fitted on
# one-hot encoded permit class / type columns.
# NOTE(review): the next code cell in this notebook reloads the CSV and repeats
# this same imputation, so this cell looks redundant — confirm before removing.

# Load your dataset
df = pd.read_csv('Electrical_permits.csv')

# Strip any leading or trailing spaces in column names
df.columns = df.columns.str.strip()

# Print the column names to ensure they are correct
print("Original columns:", df.columns)

# Categorical columns that are one-hot encoded and used as predictors
categorical_cols = ['PermitClass', 'PermitClassMapped', 'PermitTypeMapped']

# Check for the presence of the predictor columns
if any(name not in df.columns for name in categorical_cols):
    raise KeyError("The columns 'PermitClass', 'PermitClassMapped', or 'PermitTypeMapped' are not found in the DataFrame")

# Rows with a known cost train the model; rows without one receive predictions.
# The mask is computed once and reused so the split and the write-back agree.
missing_mask = df['EstProjectCost'].isnull()
non_missing_data = df[~missing_mask]
missing_data = df[missing_mask]

# Convert categorical variables to numerical using get_dummies
non_missing_data = pd.get_dummies(non_missing_data, columns=categorical_cols)
missing_data = pd.get_dummies(missing_data, columns=categorical_cols)

# Ensure both datasets have the same dummy variable columns; categories that
# never occur among the missing rows become all-zero columns
missing_data = missing_data.reindex(columns=non_missing_data.columns, fill_value=0)

# Print the columns after get_dummies to ensure they match
print("Non-missing data columns after get_dummies:", non_missing_data.columns)
print("Missing data columns after reindex:", missing_data.columns)

# Predictors are exactly the dummy columns generated from the categorical columns
dummy_prefixes = tuple(f'{name}_' for name in categorical_cols)
predictors = [col for col in non_missing_data.columns if col.startswith(dummy_prefixes)]

# Train the regression model
model = LinearRegression()
model.fit(non_missing_data[predictors], non_missing_data['EstProjectCost'])

# Predict and fill only when something is actually missing: scikit-learn
# raises ValueError when asked to predict on a zero-row input
if missing_mask.any():
    predicted_values = model.predict(missing_data[predictors])
    df.loc[missing_mask, 'EstProjectCost'] = predicted_values
else:
    predicted_values = np.empty(0)

# Verify if the missing values are filled
print(df['EstProjectCost'].isnull().sum())


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

# Stage 1 of the cleaning cell: reload the merged data and impute missing
# 'EstProjectCost' values with a linear regression on one-hot encoded
# permit class / type columns.

# Load your dataset
df = pd.read_csv('Electrical_permits.csv')

# Strip any leading or trailing spaces in column names
df.columns = df.columns.str.strip()

# Print the column names to ensure they are correct
print("Original columns:", df.columns)

# Categorical columns that are one-hot encoded and used as predictors
categorical_cols = ['PermitClass', 'PermitClassMapped', 'PermitTypeMapped']

# Check for the presence of the predictor columns
if any(name not in df.columns for name in categorical_cols):
    raise KeyError("The columns 'PermitClass', 'PermitClassMapped', or 'PermitTypeMapped' are not found in the DataFrame")

# Rows with a known cost train the model; rows without one receive predictions.
# The mask is computed once and reused so the split and the write-back agree.
missing_mask = df['EstProjectCost'].isnull()
non_missing_data = df[~missing_mask]
missing_data = df[missing_mask]

# Convert categorical variables to numerical using get_dummies
non_missing_data = pd.get_dummies(non_missing_data, columns=categorical_cols)
missing_data = pd.get_dummies(missing_data, columns=categorical_cols)

# Ensure both datasets have the same dummy variable columns; categories that
# never occur among the missing rows become all-zero columns
missing_data = missing_data.reindex(columns=non_missing_data.columns, fill_value=0)

# Print the columns after get_dummies to ensure they match
print("Non-missing data columns after get_dummies:", non_missing_data.columns)
print("Missing data columns after reindex:", missing_data.columns)

# Predictors are exactly the dummy columns generated from the categorical columns
dummy_prefixes = tuple(f'{name}_' for name in categorical_cols)
predictors = [col for col in non_missing_data.columns if col.startswith(dummy_prefixes)]

# Train the regression model
model = LinearRegression()
model.fit(non_missing_data[predictors], non_missing_data['EstProjectCost'])

# Predict and fill only when something is actually missing: scikit-learn
# raises ValueError when asked to predict on a zero-row input
if missing_mask.any():
    predicted_values = model.predict(missing_data[predictors])
    df.loc[missing_mask, 'EstProjectCost'] = predicted_values
else:
    predicted_values = np.empty(0)

# Fill every remaining NaN: back-fill from the next row first, then forward-fill
# whatever is still empty (i.e. trailing rows with no later value).
# bfill()/ffill() replace fillna(method=...), which is deprecated in pandas 2.1+.
# NOTE(review): this fills ALL columns from neighbouring rows, so a missing value
# is copied from an adjacent, unrelated row — confirm that is intended.
df = df.bfill().ffill()

# Cost ceilings per permit class; rows above the ceiling for their class are dropped
single_family_threshold = 15_000_000  # 15 million
multi_family_threshold = 50_000_000   # 50 million

# Check unique values in 'PermitClass' to ensure categories are correct
print("Unique values in 'PermitClass':", df['PermitClass'].unique())

# Print descriptive statistics for 'EstProjectCost'
print("Descriptive statistics for 'EstProjectCost':")
print(df['EstProjectCost'].describe())

# Print the number of rows before filtering
print("Number of rows before filtering:", len(df))

# Flag rows whose cost exceeds the ceiling for their permit class
single_family_over = (df['PermitClass'] == 'Single Family/Duplex') & (df['EstProjectCost'] > single_family_threshold)
multi_family_over = (df['PermitClass'] == 'MultiFamily') & (df['EstProjectCost'] > multi_family_threshold)

# Keep everything that is not flagged. The explicit .copy() makes filtered_df an
# independent frame: later cells add columns to it, and doing that on the result
# of a boolean selection triggers pandas' SettingWithCopyWarning.
filtered_df = df.loc[~(single_family_over | multi_family_over)].copy()

# Print the number of rows after filtering
print("Number of rows after filtering:", len(filtered_df))

# Verify if the missing values are filled
print(filtered_df.isnull().sum())

# Save the cleaned dataset to a new CSV file
filtered_df.to_csv('Cleaned_Electrical_permits55.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import plotly.express as px
import plotly.figure_factory as ff

In [None]:
# Show the first five rows of the cleaned, filtered permit table
print(filtered_df.head(n=5))


In [None]:


# Convert date columns to datetime; errors='coerce' turns unparseable values into NaT
date_columns = ['AppliedDate', 'IssuedDate', 'ExpiresDate', 'CompletedDate']

# Build a new frame with assign() instead of writing columns into filtered_df in
# place: in-place column assignment on a frame obtained by filtering another
# frame can trigger pandas' SettingWithCopyWarning.
filtered_df = filtered_df.assign(
    **{col: pd.to_datetime(filtered_df[col], errors='coerce') for col in date_columns}
)

# Verify if the datetime conversion was successful
print(filtered_df[date_columns].dtypes)




In [None]:
# Derive duration (in whole days) and calendar features from the permit dates.
# The .dt accessors below require the date columns to already be datetime dtype.
applied_date = filtered_df['AppliedDate']
issued_date = filtered_df['IssuedDate']
completed_date = filtered_df['CompletedDate']

# assign() returns a new frame rather than writing columns into filtered_df in
# place, which avoids pandas' SettingWithCopyWarning when filtered_df was
# obtained by filtering another frame.
filtered_df = filtered_df.assign(
    ApplicationToIssueTime=(issued_date - applied_date).dt.days,
    IssueToCompletionTime=(completed_date - issued_date).dt.days,
    TotalPermitTime=(completed_date - applied_date).dt.days,
    AppliedYear=applied_date.dt.year,
    AppliedMonth=applied_date.dt.month,
    AppliedDay=applied_date.dt.day,
)
