In [38]:
import pandas as pd
import numpy as np

df = pd.read_csv('national_parks_with_float_coors.csv')

In [24]:
import pandas as pd
import numpy as np
import ast  # To safely evaluate the string representation of the dictionary

# Assuming df is your DataFrame with a 'weather' column
# Let's check for unique values in the 'weather' column to identify potential issues
print("Unique values in the weather column:")
print(df['Weather'].unique())

# Count the number of NaN and non-string values
nan_count = df['Weather'].isna().sum()
non_string_count = df[~df['Weather'].apply(lambda x: isinstance(x, str))].shape[0]

print(f"Number of NaN values in 'weather': {nan_count}")
print(f"Number of non-string values in 'weather': {non_string_count}")

# Define a function to extract weather features from the 'weather' column
def extract_weather_features(row):
    weather_data_str = row['Weather']
    
    # Check the type of weather_data_str
    if isinstance(weather_data_str, str):
        try:
            # Clean the string if necessary
            weather_data_str = weather_data_str.replace("'", "\"")  # Replace single quotes with double quotes
            
            # Parse the string into a dictionary
            weather_data = ast.literal_eval(weather_data_str)
        except (ValueError, SyntaxError) as e:
            print(f"Error parsing weather data: {e}")
            return pd.Series([np.nan] * 12)  # Return NaN for all features if parsing fails
    else:
        print(f"Weather data is not a string: {type(weather_data_str)}")
        return pd.Series([np.nan] * 12)  # Return NaN for all features if it's not a string

    # Extract weather data, handling missing keys gracefully
    return pd.Series({
        'Temperature': weather_data.get('temp', np.nan),
        'Feelslike': weather_data.get('feelslike', np.nan),
        'Humidity': weather_data.get('humidity', np.nan),
        'Dew': weather_data.get('dew', np.nan),
        'Windspeed': weather_data.get('windspeed', np.nan),
        'Winddir': weather_data.get('winddir', np.nan),
        'Windgust': weather_data.get('windgust', np.nan),
        'Conditions': weather_data.get('conditions', np.nan),
        'Precip': weather_data.get('precip', np.nan),
        'Visibility': weather_data.get('visibility', np.nan),
        'Pressure': weather_data.get('pressure', np.nan),
        'Cloud Cover': weather_data.get('cloudcover', np.nan),
    })

# Apply the function to extract features and concatenate them with the original DataFrame
weather_features = df.apply(extract_weather_features, axis=1)

# Set column names based on the number of features returned
weather_features.columns = ['Temperature', 'Feelslike', 'Humidity', 'Dew', 
                            'Windspeed', 'Winddir', 'Windgust', 
                            'Conditions', 'Precip', 'Visibility', 
                            'Pressure', 'Cloud Cover']

# Concatenate the new features with the original DataFrame
df = pd.concat([df, weather_features], axis=1)

# Display the updated DataFrame
print(df.head())







Unique values in the weather column:
['']
Number of NaN values in 'weather': 0
Number of non-string values in 'weather': 0
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, line 0)
Error parsing weather data: invalid syntax (<unknown>, lin

In [14]:
# Assuming you want to keep all 24 features, create a list with 24 names
weather_features.columns = ['Temperature', 'Feelslike', 'Humidity', 'Dew', 
                            'Windspeed', 'Winddir', 'Windgust', 
                            'Conditions', 'Precip', 'Visibility', 
                            'Pressure', 'Cloud Cover'] + ['Additional Feature ' + str(i) for i in range(12)]  # Example for extra features


In [17]:
def extract_weather_features(row):
    weather_data_str = row['Weather']
    
    # Check the type of weather_data_str
    if isinstance(weather_data_str, str):
        try:
            # Clean the string if necessary
            weather_data_str = weather_data_str.replace("'", "\"")  # Replace single quotes with double quotes
            
            # Parse the string into a dictionary
            weather_data = ast.literal_eval(weather_data_str)
        except (ValueError, SyntaxError) as e:
            print(f"Error parsing weather data: {e}")
            return pd.Series([np.nan] * 12)  # Return NaN for all features if parsing fails
    else:
        print(f"Weather data is not a string: {type(weather_data_str)}")
        return pd.Series([np.nan] * 12)  # Return NaN for all features if it's not a string

    # Extract weather data, handling missing keys gracefully
    return pd.Series({
        'Temperature': weather_data.get('temp', np.nan),
        'Feelslike': weather_data.get('feelslike', np.nan),
        'Humidity': weather_data.get('humidity', np.nan),
        'Dew': weather_data.get('dew', np.nan),
        'Windspeed': weather_data.get('windspeed', np.nan),
        'Winddir': weather_data.get('winddir', np.nan),
        'Windgust': weather_data.get('windgust', np.nan),
        'Conditions': weather_data.get('conditions', np.nan),
        'Precip': weather_data.get('precip', np.nan),
        'Visibility': weather_data.get('visibility', np.nan),
        'Pressure': weather_data.get('pressure', np.nan),
        'Cloud Cover': weather_data.get('cloudcover', np.nan),
    })

# Apply the function to extract features and concatenate them with the original DataFrame
weather_features = df.apply(extract_weather_features, axis=1)

# Check the shape and contents again
print(f"Shape of weather_features: {weather_features.shape}")
print(weather_features.head())

# Set column names based on the actual number of features returned
expected_column_count = 12
if weather_features.shape[1] == expected_column_count:
    weather_features.columns = ['Temperature', 'Feelslike', 'Humidity', 'Dew', 
                                'Windspeed', 'Winddir', 'Windgust', 
                                'Conditions', 'Precip', 'Visibility', 
                                'Pressure', 'Cloud Cover']
else:
    print(f"Expected {expected_column_count} columns, but got {weather_features.shape[1]}")

# Concatenate the new features with the original DataFrame if columns match
if weather_features.shape[1] == expected_column_count:
    df = pd.concat([df, weather_features], axis=1)
    print(df.head())


Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <class 'float'>
Weather data is not a string: <cla

In [26]:
# Convert the 'Weather' column to strings, treating NaN as empty strings
df['Weather'] = df['Weather'].astype(str).replace('nan', '')

# Check the unique values in the 'Weather' column to see if there are any valid entries
unique_weather_entries = df['Weather'].unique()
print("Unique values in the 'Weather' column after conversion:")
print(unique_weather_entries)

# Count valid (non-empty) entries
valid_entries_count = df['Weather'].str.strip().ne('').sum()
print(f"Number of valid weather entries: {valid_entries_count}")



Unique values in the 'Weather' column after conversion:
["{'datetime': '2007-01-01', 'datetimeEpoch': 1167634800, 'tempmax': 0.0, 'tempmin': 0.0, 'temp': None, 'feelslikemax': 0.0, 'feelslikemin': 0.0, 'feelslike': None, 'dew': None, 'humidity': None, 'precip': None, 'precipprob': None, 'precipcover': 0.0, 'preciptype': None, 'snow': None, 'snowdepth': None, 'windgust': None, 'windspeed': None, 'winddir': None, 'pressure': None, 'cloudcover': None, 'visibility': None, 'solarradiation': None, 'solarenergy': None, 'uvindex': None, 'sunrise': '07:39:37', 'sunriseEpoch': 1167662377, 'sunset': '17:16:24', 'sunsetEpoch': 1167696984, 'moonphase': 0.44, 'conditions': '', 'description': '', 'icon': '', 'stations': None, 'hours': [{'datetime': '00:00:00', 'datetimeEpoch': 1167634800, 'temp': None, 'feelslike': None, 'humidity': None, 'dew': None, 'precip': None, 'precipprob': None, 'snow': None, 'snowdepth': None, 'preciptype': None, 'windgust': None, 'windspeed': None, 'winddir': None, 'pressur

In [None]:


# Convert valid weather entries from string to JSON
def parse_weather_data(row):
    try:
        # Replace single quotes with double quotes if necessary
        weather_data_str = row['Weather'].replace("'", "\"")
        return json.loads(weather_data_str)
    except (ValueError, SyntaxError) as e:
        print(f"Error parsing weather data: {e}")
        return {}

# Apply the parsing function to the valid weather entries
parsed_weather_data = valid_weather_df.apply(parse_weather_data, axis=1)

# Normalize the parsed weather data into a DataFrame
weather_features_df = pd.json_normalize(parsed_weather_data)

# Display the new DataFrame with weather features
print("Weather features DataFrame:")
print(weather_features_df.head())

# Optionally, concatenate this DataFrame with the original valid entries
final_df = pd.concat([valid_weather_df.reset_index(drop=True), weather_features_df.reset_index(drop=True)], axis=1)

# Display the final DataFrame
print("Final DataFrame with Weather Features:")
print(final_df.head())


Number of valid weather entries: 234
Error parsing weather data: Expecting value: line 1 column 97 (char 96)
Error parsing weather data: Expecting value: line 1 column 267 (char 266)
Error parsing weather data: Expecting value: line 1 column 267 (char 266)
Error parsing weather data: Expecting value: line 1 column 267 (char 266)
Error parsing weather data: Expecting value: line 1 column 267 (char 266)
Error parsing weather data: Expecting value: line 1 column 265 (char 264)
Error parsing weather data: Expecting value: line 1 column 326 (char 325)
Error parsing weather data: Expecting value: line 1 column 266 (char 265)
Error parsing weather data: Expecting value: line 1 column 456 (char 455)
Error parsing weather data: Expecting value: line 1 column 267 (char 266)
Error parsing weather data: Expecting value: line 1 column 267 (char 266)
Error parsing weather data: Expecting value: line 1 column 323 (char 322)
Error parsing weather data: Expecting value: line 1 column 267 (char 266)
Err

In [46]:
# Ensure the date column is in datetime format
date_column_name = 'your_date_column_name'  # Replace this with the actual name of your date column
final_df['Incident Date'] = pd.to_datetime(final_df['Incident Date'], errors='coerce')

# Extract the year and create a new column
final_df['Year'] = final_df['Incident Date'].dt.year

# Analyze numeric variables, including the new 'Year' column
numeric_summary = final_df.describe().T  # Transpose for better readability

# Add sample size for numeric variables
numeric_summary['Sample Size'] = final_df.count()

# Select the relevant statistics
numeric_summary = numeric_summary[['Sample Size', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']]
numeric_summary.columns = ['Sample Size', 'Mean', 'Standard Deviation', 'Min', 'Q1', 'Median (Q2)', 'Q3', 'Max']

# Analyze categorical variables, excluding the original date column and including the 'Year' column
categorical_summary = final_df.select_dtypes(include=['object']).describe().T  # Get summary for categorical variables

# Prepare a summary DataFrame for categorical variables without counts
categorical_summary = categorical_summary[['count', 'unique', 'top']]  # Selecting only the relevant columns
categorical_summary.columns = ['Sample Size', 'Unique Categories', 'Top Category']

# Year summary as a separate DataFrame
year_summary = final_df['Year'].value_counts().reset_index()
year_summary.columns = ['Year', 'Counts']

# Save the summaries to separate CSV files
numeric_summary_file = 'numeric_summary.csv'
categorical_summary_file = 'categorical_summary.csv'
year_summary_file = 'year_counts.csv'

# Save numeric summary
numeric_summary.to_csv(numeric_summary_file)
print(f"Numeric summary saved to {numeric_summary_file}")

# Save categorical summary
categorical_summary.to_csv(categorical_summary_file)
print(f"Categorical summary saved to {categorical_summary_file}")

# Save year summary
year_summary.to_csv(year_summary_file)
print(f"Year counts saved to {year_summary_file}")


Numeric summary saved to numeric_summary.csv
Categorical summary saved to categorical_summary.csv
Year counts saved to year_counts.csv


In [34]:
numeric_summary.to_csv("numerics.csv")
categorical_summary.to_csv("categors.csv")