# Pittsburgh Wind Data Analysis
## Initial Steps

In [2]:
import pandas as pd
import altair as alt

# Prevent Altair from creating JSON files - use inline data instead
alt.data_transformers.enable('default')

df = pd.read_csv("Pittsburgh_hourly_normals_1981_2010.csv")

print(f"Shape: {df.shape}")

df.head()

Shape: (8760, 58)


Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,HLY-CLDH-NORMAL,HLY-CLDH-NORMAL_ATTRIBUTES,HLY-CLOD-PCTBKN,HLY-CLOD-PCTBKN_ATTRIBUTES,...,HLY-WIND-2NDPCT,HLY-WIND-2NDPCT_ATTRIBUTES,HLY-WIND-AVGSPD,HLY-WIND-AVGSPD_ATTRIBUTES,HLY-WIND-PCTCLM,HLY-WIND-PCTCLM_ATTRIBUTES,HLY-WIND-VCTDIR,HLY-WIND-VCTDIR_ATTRIBUTES,HLY-WIND-VCTSPD,HLY-WIND-VCTSPD_ATTRIBUTES
0,USW00094823,01-01T00:00:00,40.4846,-80.2144,366.7,"PITTSBURGH ASOS, PA US",-7777,P,101,C,...,194,S,85,S,89,S,252,S,42,S
1,USW00094823,01-01T01:00:00,40.4846,-80.2144,366.7,"PITTSBURGH ASOS, PA US",-7777,C,86,C,...,178,C,83,C,96,C,252,C,42,C
2,USW00094823,01-01T02:00:00,40.4846,-80.2144,366.7,"PITTSBURGH ASOS, PA US",0,C,103,C,...,191,C,82,C,122,C,252,C,41,C
3,USW00094823,01-01T03:00:00,40.4846,-80.2144,366.7,"PITTSBURGH ASOS, PA US",0,C,81,C,...,194,C,80,C,122,C,251,C,40,C
4,USW00094823,01-01T04:00:00,40.4846,-80.2144,366.7,"PITTSBURGH ASOS, PA US",0,C,90,C,...,210,C,81,C,102,C,252,C,40,C


In [3]:
df.columns.tolist()

['STATION',
 'DATE',
 'LATITUDE',
 'LONGITUDE',
 'ELEVATION',
 'NAME',
 'HLY-CLDH-NORMAL',
 'HLY-CLDH-NORMAL_ATTRIBUTES',
 'HLY-CLOD-PCTBKN',
 'HLY-CLOD-PCTBKN_ATTRIBUTES',
 'HLY-CLOD-PCTCLR',
 'HLY-CLOD-PCTCLR_ATTRIBUTES',
 'HLY-CLOD-PCTFEW',
 'HLY-CLOD-PCTFEW_ATTRIBUTES',
 'HLY-CLOD-PCTOVC',
 'HLY-CLOD-PCTOVC_ATTRIBUTES',
 'HLY-CLOD-PCTSCT',
 'HLY-CLOD-PCTSCT_ATTRIBUTES',
 'HLY-DEWP-10PCTL',
 'HLY-DEWP-10PCTL_ATTRIBUTES',
 'HLY-DEWP-90PCTL',
 'HLY-DEWP-90PCTL_ATTRIBUTES',
 'HLY-DEWP-NORMAL',
 'HLY-DEWP-NORMAL_ATTRIBUTES',
 'HLY-HIDX-NORMAL',
 'HLY-HIDX-NORMAL_ATTRIBUTES',
 'HLY-HTDH-NORMAL',
 'HLY-HTDH-NORMAL_ATTRIBUTES',
 'HLY-PRES-10PCTL',
 'HLY-PRES-10PCTL_ATTRIBUTES',
 'HLY-PRES-90PCTL',
 'HLY-PRES-90PCTL_ATTRIBUTES',
 'HLY-PRES-NORMAL',
 'HLY-PRES-NORMAL_ATTRIBUTES',
 'HLY-TEMP-10PCTL',
 'HLY-TEMP-10PCTL_ATTRIBUTES',
 'HLY-TEMP-90PCTL',
 'HLY-TEMP-90PCTL_ATTRIBUTES',
 'HLY-TEMP-NORMAL',
 'HLY-TEMP-NORMAL_ATTRIBUTES',
 'HLY-WCHL-NORMAL',
 'HLY-WCHL-NORMAL_ATTRIBUTES',
 'HLY-WIND-

In [4]:
# Drop all columns that are attributes
attributes_columns = [col for col in df.columns if 'attributes' in col.lower()]
print(f"Removing {len(attributes_columns)} columns")

df_cleaned = df.drop(columns=attributes_columns)
print(f"DataFrame shape: {df_cleaned.shape}")

df = df_cleaned

# Focus on wind columns
wind_columns = [col for col in df.columns if 'WIND' in col]
print(f"\nWind-related columns ({len(wind_columns)}):")
for col in wind_columns:
    print(f"  - {col}")

# Examine basic wind statistics
print("\nWind data summary:")
print(df[wind_columns].describe())

Removing 26 columns
DataFrame shape: (8760, 32)

Wind-related columns (8):
  - HLY-WIND-1STDIR
  - HLY-WIND-1STPCT
  - HLY-WIND-2NDDIR
  - HLY-WIND-2NDPCT
  - HLY-WIND-AVGSPD
  - HLY-WIND-PCTCLM
  - HLY-WIND-VCTDIR
  - HLY-WIND-VCTSPD

Wind data summary:
       HLY-WIND-1STDIR  HLY-WIND-1STPCT  HLY-WIND-2NDDIR  HLY-WIND-2NDPCT  \
count      8760.000000      8760.000000      8760.000000      8760.000000   
mean          6.348174       205.657078         6.225799       170.700228   
std           1.367510        45.294276         1.433592        37.969046   
min           1.000000        97.000000         1.000000        90.000000   
25%           6.000000       168.000000         6.000000       139.000000   
50%           7.000000       207.000000         6.000000       169.000000   
75%           7.000000       243.000000         7.000000       198.000000   
max           8.000000       328.000000         8.000000       281.000000   

       HLY-WIND-AVGSPD  HLY-WIND-PCTCLM  HLY-WIND-V

In [5]:
# Unit Conversion for Wind Data
# Speeds are 10 times higher than expected
speed_columns = ['HLY-WIND-AVGSPD', 'HLY-WIND-VCTSPD']
for col in speed_columns:
    df[col] = df[col] / 10.0

# Percentages are also 10 times higher than expected
percentage_columns = ['HLY-WIND-1STPCT', 'HLY-WIND-2NDPCT', 'HLY-WIND-PCTCLM']
for col in percentage_columns:
    df[col] = df[col] / 10.0

print("\nUpdated speed statistics:")
print(df[speed_columns].describe())
print("\nUpdated percentage statistics:")
print(df[percentage_columns].describe())


Updated speed statistics:
       HLY-WIND-AVGSPD  HLY-WIND-VCTSPD
count      8760.000000      8760.000000
mean          8.089954         3.079132
std           2.291649         1.691044
min           3.400000         0.000000
25%           6.300000         1.500000
50%           8.300000         3.100000
75%           9.700000         4.300000
max          13.300000         7.200000

Updated percentage statistics:
       HLY-WIND-1STPCT  HLY-WIND-2NDPCT  HLY-WIND-PCTCLM
count      8760.000000      8760.000000      8760.000000
mean         20.565708        17.070023        12.452066
std           4.529428         3.796905        10.655782
min           9.700000         9.000000         0.700000
25%          16.800000        13.900000         3.600000
50%          20.700000        16.900000         8.700000
75%          24.300000        19.800000        17.800000
max          32.800000        28.100000        41.100000


In [6]:
# Add date and season columns
df['DATE'] = pd.to_datetime('2000-' + df['DATE'].astype(str))
df['Month'] = df['DATE'].dt.month
df['Hour'] = df['DATE'].dt.hour
df['Season'] = df['Month'].map({12: 'Winter', 1: 'Winter', 2: 'Winter',
                                3: 'Spring', 4: 'Spring', 5: 'Spring',
                                6: 'Summer', 7: 'Summer', 8: 'Summer',
                                9: 'Fall', 10: 'Fall', 11: 'Fall'})

# Map the values 1-8 to cardinal directions
def find_cardinal_direction(direction_value):
        direction_map = {1: 'N', 2: 'NE', 3: 'E', 4: 'SE', 5: 'S', 6: 'SW', 7: 'W', 8: 'NW'}
        return direction_map.get(int(direction_value))

# Add wind direction columns
df['Primary_Wind_Direction'] = df['HLY-WIND-1STDIR'].apply(find_cardinal_direction)
df['Secondary_Wind_Direction'] = df['HLY-WIND-2NDDIR'].apply(find_cardinal_direction)

# Remove numeric direction columns
df = df.drop(columns=['HLY-WIND-1STDIR', 'HLY-WIND-2NDDIR'])

print(f"Primary wind direction distribution: {df['Primary_Wind_Direction'].value_counts().to_dict()}")
print(f"Secondary wind direction distribution: {df['Secondary_Wind_Direction'].value_counts().to_dict()}")

Primary wind direction distribution: {'W': 4461, 'SW': 2956, 'NW': 557, 'N': 393, 'S': 242, 'SE': 136, 'E': 14, 'NE': 1}
Secondary wind direction distribution: {'SW': 3554, 'W': 2733, 'NW': 1200, 'SE': 439, 'S': 432, 'N': 318, 'E': 81, 'NE': 3}


In [7]:
df.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,HLY-CLDH-NORMAL,HLY-CLOD-PCTBKN,HLY-CLOD-PCTCLR,HLY-CLOD-PCTFEW,...,HLY-WIND-2NDPCT,HLY-WIND-AVGSPD,HLY-WIND-PCTCLM,HLY-WIND-VCTDIR,HLY-WIND-VCTSPD,Month,Hour,Season,Primary_Wind_Direction,Secondary_Wind_Direction
0,USW00094823,2000-01-01 00:00:00,40.4846,-80.2144,366.7,"PITTSBURGH ASOS, PA US",-7777,101,121,87,...,19.4,8.5,8.9,252,4.2,1,0,Winter,W,SW
1,USW00094823,2000-01-01 01:00:00,40.4846,-80.2144,366.7,"PITTSBURGH ASOS, PA US",-7777,86,109,86,...,17.8,8.3,9.6,252,4.2,1,1,Winter,W,SW
2,USW00094823,2000-01-01 02:00:00,40.4846,-80.2144,366.7,"PITTSBURGH ASOS, PA US",0,103,119,99,...,19.1,8.2,12.2,252,4.1,1,2,Winter,W,SW
3,USW00094823,2000-01-01 03:00:00,40.4846,-80.2144,366.7,"PITTSBURGH ASOS, PA US",0,81,121,83,...,19.4,8.0,12.2,251,4.0,1,3,Winter,W,SW
4,USW00094823,2000-01-01 04:00:00,40.4846,-80.2144,366.7,"PITTSBURGH ASOS, PA US",0,90,128,65,...,21.0,8.1,10.2,252,4.0,1,4,Winter,W,SW


## Wind Speed Analysis

How do wind speeds vary seasonally and hourly?


In [8]:
# Finding monthly wind speed patterns
monthly_wind = df.groupby('Month').agg({
    'HLY-WIND-AVGSPD': 'mean',
    'HLY-WIND-VCTSPD': 'mean'
}).reset_index()

monthly_wind['Month_Name'] = monthly_wind['Month'].map({
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
})

# Creating monthly wind speed chart
monthly_chart = alt.Chart(monthly_wind).mark_line(point=True).encode(
    x=alt.X('Month', title='Month'),
    y=alt.Y('HLY-WIND-AVGSPD', title='Average Wind Speed (mph)'),
).properties(
    width=500,
    title='Monthly Average Wind Speeds'
)

monthly_chart

In [19]:
# Finding hourly wind speed patterns
hourly_wind = df.groupby('Hour')['HLY-WIND-AVGSPD'].mean().reset_index()

# Creating hourly wind speed chart
hourly_chart = alt.Chart(hourly_wind).mark_area(
    line={'color': 'steelblue', 'strokeWidth': 2},
    color=alt.Gradient(
        gradient='linear',
        stops=[alt.GradientStop(color='lightblue', offset=0),
               alt.GradientStop(color='steelblue', offset=1)],
        x1=1, y1=1, x2=1, y2=0
    )
).encode(
    x=alt.X('Hour:O', title='Hour of Day'),
    y=alt.Y('HLY-WIND-AVGSPD:Q', title='Average Wind Speed (mph)')
).properties(
    width=500,
    height=300,
    title='Average Wind Speed by Hour (1981-2010)'
)

hourly_chart

## Wind Direction Analysis
What are the dominant wind directions and seasonal patterns?

In [123]:
# Finding wind direction frequency
direction_counts = df['Primary_Wind_Direction'].value_counts().reset_index()
direction_counts.columns = ['Direction', 'Count']
direction_counts['Percentage'] = (direction_counts['Count'] / direction_counts['Count'].sum()) * 100

# Order directions clockwise starting from North
direction_order = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW']

# Create a properly ordered DataFrame
ordered_data = []
for direction in direction_order:
    if direction in direction_counts['Direction'].values:
        row = direction_counts[direction_counts['Direction'] == direction].iloc[0]
        ordered_data.append(row)

direction_counts_ordered = pd.DataFrame(ordered_data)

# Create wind distribution pie chart
wind_distribution_pie = alt.Chart(direction_counts_ordered).mark_arc().encode(
    theta=alt.Theta('Count:Q'),
    color=alt.Color('Direction:N', 
                    scale=alt.Scale(scheme='category20'),
                    sort=direction_order,
                    legend=alt.Legend(title="Wind Direction"))
).properties(
    title='Wind Direction Distribution'
)

wind_distribution_pie

In [32]:
# Seasonal wind direction analysis
seasonal_directions = df.groupby(['Season', 'Primary_Wind_Direction']).size().reset_index(name='Count')

# Calculate percentages within each season
seasonal_directions['Percentage'] = seasonal_directions.groupby('Season')['Count'].transform(
    lambda x: (x / x.sum()) * 100
)

# Create seasonal wind direction chart
seasonal_chart = alt.Chart(seasonal_directions).mark_bar().encode(
    x=alt.X('Primary_Wind_Direction:N', 
            title='Wind Direction',
            sort=direction_order),
    y=alt.Y('Percentage:Q', title='Percentage of Time'),
    color=alt.Color('Season:N', scale=alt.Scale(scheme='category10')),
    column=alt.Column('Season:N', title='Season'),
    tooltip=['Season:N', 'Primary_Wind_Direction:N', 'Percentage:Q']
).properties(
    width=150,
    height=300,
    title='Seasonal Wind Direction Patterns'
).resolve_scale(
    y='independent'
)

seasonal_chart

## Conditions Analysis
Examining when Pittsburgh experiences low or high wind conditions.

In [47]:
# Question 3: How do calm conditions vary throughout the year?

# Analyze calm percentage data and wind speed distributions
calm_analysis = df.groupby(['Month', 'Season']).agg({
    'HLY-WIND-PCTCLM': 'mean',  # Percentage of calm conditions
    'HLY-WIND-AVGSPD': ['mean', 'std', 'min', 'max']
}).reset_index()

# Flatten column names
calm_analysis.columns = ['Month', 'Season', 'Calm_Percentage', 'Avg_Speed_Mean', 
                        'Avg_Speed_Std', 'Avg_Speed_Min', 'Avg_Speed_Max']

# Add month names
calm_analysis['Month_Name'] = calm_analysis['Month'].map({
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
})

# Create calm conditions chart
calm_chart = alt.Chart(calm_analysis).mark_bar().encode(
    x=alt.X('Month:O', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('Calm_Percentage:Q', title='Percentage of Calm Conditions (%)'),
    color=alt.Color('Season:N', scale=alt.Scale(scheme='category10')),
    tooltip=['Month_Name:N', 'Season:N', 'Calm_Percentage:Q', 'Avg_Speed_Mean:Q']
).properties(
    width=600,
    height=300,
    title='Monthly Calm Wind Conditions in Pittsburgh'
)

calm_chart

In [34]:
# Wind speed distribution analysis
wind_speed_dist = df[['Month', 'Season', 'HLY-WIND-AVGSPD']].copy()
wind_speed_dist['Speed_Category'] = pd.cut(wind_speed_dist['HLY-WIND-AVGSPD'], 
                                          bins=[0, 3, 7, 12, 18, 25, float('inf')],
                                          labels=['Calm (0-3)', 'Light (3-7)', 'Gentle (7-12)', 
                                                 'Moderate (12-18)', 'Fresh (18-25)', 'Strong (25+)'])

# Create wind speed distribution heatmap
speed_dist = wind_speed_dist.groupby(['Month', 'Speed_Category']).size().reset_index(name='Count')
speed_dist['Percentage'] = speed_dist.groupby('Month')['Count'].transform(lambda x: (x / x.sum()) * 100)

speed_heatmap = alt.Chart(speed_dist).mark_rect().encode(
    x=alt.X('Month:O', title='Month'),
    y=alt.Y('Speed_Category:N', title='Wind Speed Category (mph)'),
    color=alt.Color('Percentage:Q', 
                    scale=alt.Scale(scheme='blues'),
                    title='Percentage of Time'),
    tooltip=['Month:O', 'Speed_Category:N', 'Percentage:Q']
).properties(
    width=500,
    height=200,
    title='Wind Speed Distribution Throughout the Year'
)

speed_heatmap

  speed_dist = wind_speed_dist.groupby(['Month', 'Speed_Category']).size().reset_index(name='Count')
