# Read txt and convert to dataframe with DOY

In [1]:
import requests
import json
import os
from datetime import datetime
import re
import glob
import numpy as np
import pandas as pd

In [69]:
# Year used to build the input folder paths and to convert week dates to day of year.
year = 2023

In [70]:
# Folder whose .txt files are read for the 'Planted' stage of the selected year.
folder_path = f'/content/drive/MyDrive/Lab/CPR/{year}/filtered/Planted/'
def process_file(file_path):
    """Parse one report text file into a three-column DataFrame.

    The column names come from the line two below the first line containing
    "Week ending", split on ':'. Data rows are lines shaped like
    "<name>: v v v v", where each value is digits, '-' or '(NA)'.

    Returns the name column plus the 2nd and 3rd value columns; '(NA)' and
    '-' become None, and the two kept value headers get a trailing '_'.

    Raises StopIteration when no line contains "Week ending".
    """
    with open(file_path, 'r') as handle:
        text = handle.read()
    lines = text.strip().split('\n')

    # Position of the first "Week ending" line; the header names sit two lines below it.
    week_row = next(idx for idx, text_line in enumerate(lines) if "Week ending" in text_line)
    headers = []
    for cell in re.split(r'\s*:\s*', lines[week_row + 2]):
        cell = cell.strip()
        if cell:
            headers.append(cell)

    pattern = re.compile(r'([A-Za-z\s.]+):\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))')
    hits = (pattern.match(text_line) for text_line in lines)
    # NOTE(review): strip().rstrip('.') removes dot leaders, but a space that
    # preceded the dots would stay at the end of the name.
    data = [[hit.group(1).strip().rstrip('.'), *hit.groups()[1:]] for hit in hits if hit]

    table = pd.DataFrame(data, columns=headers)
    table = table.replace({'(NA)': None, '-': None})
    table = table.iloc[:, [0, 2, 3]]
    name_col, first_week, second_week = table.columns
    table.columns = [name_col, f"{first_week}_", f"{second_week}_"]
    return table

# Gather the .txt reports in the stage folder and parse each one.
file_paths = []
for entry in os.listdir(folder_path):
    if entry.endswith('.txt'):
        file_paths.append(os.path.join(folder_path, entry))
dataframes = list(map(process_file, file_paths))

# Outer-join the frames on 'State'; a column already present in the running
# result is dropped from the incoming frame so it is not duplicated.
merged_df = dataframes[0]
for df in dataframes[1:]:
    repeated = [col for col in df.columns if col != 'State' and col in merged_df.columns]
    merged_df = merged_df.merge(df.drop(columns=repeated), on='State', how='outer')

def extract_date(col):
    """Parse the "<Month> <day>" prefix of a column label into a sortable datetime.

    Only the text before the ',_' separator is parsed (e.g. "April 30,_").

    Returns:
        A datetime in the fixed leap year 2000; only its month/day ordering
        is meaningful.

    Raises:
        ValueError: if the prefix is not a full month name followed by a day.
    """
    month_day = col.split(',_')[0]
    # Parsing '%B %d' without a year defaults to 1900, which rejects
    # "February 29" and is deprecated since Python 3.13; pin a leap year instead.
    return datetime.strptime(f"2000 {month_day}", '%Y %B %d')

# Put the non-'State' columns in the order given by extract_date, with 'State' first.
state_col = merged_df['State']
week_cols = sorted(merged_df.columns.difference(['State']), key=extract_date)
weekly_values = merged_df.drop(columns='State')[week_cols]
df_sorted = pd.concat([state_col, weekly_values], axis=1)

def extract_doy(col):
    """Return the day of year (1-based) of a column label within the module-level `year`.

    Only the text before the ',_' separator is parsed, as "<Month> <day>".
    """
    month_day = col.partition(',_')[0]
    stamp = datetime.strptime(f"{year} {month_day}", '%Y %B %d')
    return int(stamp.strftime('%j'))

# Relabel every column after the first with its day of year (as text); the first becomes 'State'.
doy_labels = ['State']
for col in df_sorted.columns[1:]:
    doy_labels.append(f"{extract_doy(col)}{col.split(',_')[1]} ".strip())
df_sorted.columns = doy_labels

# Cast the values to floats; entries that cannot be parsed become NaN.
value_columns = df_sorted.columns[1:]
for col in value_columns:
    df_sorted[col] = pd.to_numeric(df_sorted[col], errors='coerce', downcast='float')

def calculate_mean_doy(row):
    """Return the rounded-up mean of the day-of-year labels that count for a row.

    A label counts when it is an all-digit string and its value in `row` is
    neither missing nor equal to 100. Non-numeric labels such as 'State' or
    an already-added 'mean' are ignored, so the function depends only on the
    row it is given and can safely be re-applied.

    Args:
        row: a pandas Series labelled by 'State' plus day-of-year strings.

    Returns:
        int: ceiling of the mean of the counted labels, or None if none count.
    """
    doys = [
        int(label)
        for label, value in row.items()
        if str(label).isdigit() and not pd.isna(value) and value != 100
    ]
    return int(np.ceil(np.mean(doys))) if doys else None

# Add the per-row mean day of year, then keep the first and last columns
# ('State' and the 'mean' just added) as the Planted result.
mean_doy = df_sorted.apply(calculate_mean_doy, axis=1)
df_sorted['mean'] = mean_doy

df_planted = df_sorted.iloc[:, [0, -1]]
df_planted.head()

Unnamed: 0,State,mean
0,Colorado,131
1,Illinois,127
2,Indiana,131
3,Iowa,127
4,Kansas,124


In [71]:
# Folder whose .txt files are read for the 'Emerged' stage of the selected year.
folder_path = f'/content/drive/MyDrive/Lab/CPR/{year}/filtered/Emerged/'
def process_file(file_path):
    """Parse one report text file into a three-column DataFrame.

    The column names come from the line two below the first line containing
    "Week ending", split on ':'. Data rows are lines shaped like
    "<name>: v v v v", where each value is digits, '-' or '(NA)'.

    Returns the name column plus the 2nd and 3rd value columns; '(NA)' and
    '-' become None, and the two kept value headers get a trailing '_'.

    Raises StopIteration when no line contains "Week ending".
    """
    with open(file_path, 'r') as handle:
        text = handle.read()
    lines = text.strip().split('\n')

    # Position of the first "Week ending" line; the header names sit two lines below it.
    week_row = next(idx for idx, text_line in enumerate(lines) if "Week ending" in text_line)
    headers = []
    for cell in re.split(r'\s*:\s*', lines[week_row + 2]):
        cell = cell.strip()
        if cell:
            headers.append(cell)

    pattern = re.compile(r'([A-Za-z\s.]+):\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))')
    hits = (pattern.match(text_line) for text_line in lines)
    # NOTE(review): strip().rstrip('.') removes dot leaders, but a space that
    # preceded the dots would stay at the end of the name.
    data = [[hit.group(1).strip().rstrip('.'), *hit.groups()[1:]] for hit in hits if hit]

    table = pd.DataFrame(data, columns=headers)
    table = table.replace({'(NA)': None, '-': None})
    table = table.iloc[:, [0, 2, 3]]
    name_col, first_week, second_week = table.columns
    table.columns = [name_col, f"{first_week}_", f"{second_week}_"]
    return table

# Gather the .txt reports in the stage folder and parse each one.
file_paths = []
for entry in os.listdir(folder_path):
    if entry.endswith('.txt'):
        file_paths.append(os.path.join(folder_path, entry))
dataframes = list(map(process_file, file_paths))

# Outer-join the frames on 'State'; a column already present in the running
# result is dropped from the incoming frame so it is not duplicated.
merged_df = dataframes[0]
for df in dataframes[1:]:
    repeated = [col for col in df.columns if col != 'State' and col in merged_df.columns]
    merged_df = merged_df.merge(df.drop(columns=repeated), on='State', how='outer')

def extract_date(col):
    """Parse the "<Month> <day>" prefix of a column label into a sortable datetime.

    Only the text before the ',_' separator is parsed (e.g. "May 14,_").

    Returns:
        A datetime in the fixed leap year 2000; only its month/day ordering
        is meaningful.

    Raises:
        ValueError: if the prefix is not a full month name followed by a day.
    """
    month_day = col.split(',_')[0]
    # Parsing '%B %d' without a year defaults to 1900, which rejects
    # "February 29" and is deprecated since Python 3.13; pin a leap year instead.
    return datetime.strptime(f"2000 {month_day}", '%Y %B %d')

# Put the non-'State' columns in the order given by extract_date, with 'State' first.
state_col = merged_df['State']
week_cols = sorted(merged_df.columns.difference(['State']), key=extract_date)
weekly_values = merged_df.drop(columns='State')[week_cols]
df_sorted = pd.concat([state_col, weekly_values], axis=1)

def extract_doy(col):
    """Return the day of year (1-based) of a column label within the module-level `year`.

    Only the text before the ',_' separator is parsed, as "<Month> <day>".
    """
    month_day = col.partition(',_')[0]
    stamp = datetime.strptime(f"{year} {month_day}", '%Y %B %d')
    return int(stamp.strftime('%j'))

# Relabel every column after the first with its day of year (as text); the first becomes 'State'.
doy_labels = ['State']
for col in df_sorted.columns[1:]:
    doy_labels.append(f"{extract_doy(col)}{col.split(',_')[1]} ".strip())
df_sorted.columns = doy_labels

# Cast the values to floats; entries that cannot be parsed become NaN.
value_columns = df_sorted.columns[1:]
for col in value_columns:
    df_sorted[col] = pd.to_numeric(df_sorted[col], errors='coerce', downcast='float')

def calculate_mean_doy(row):
    """Return the rounded-up mean of the day-of-year labels that count for a row.

    A label counts when it is an all-digit string and its value in `row` is
    neither missing nor equal to 100. Non-numeric labels such as 'State' or
    an already-added 'mean' are ignored, so the function depends only on the
    row it is given and can safely be re-applied.

    Args:
        row: a pandas Series labelled by 'State' plus day-of-year strings.

    Returns:
        int: ceiling of the mean of the counted labels, or None if none count.
    """
    doys = [
        int(label)
        for label, value in row.items()
        if str(label).isdigit() and not pd.isna(value) and value != 100
    ]
    return int(np.ceil(np.mean(doys))) if doys else None

# Add the per-row mean day of year, then keep the first and last columns
# ('State' and the 'mean' just added) as the Emerged result.
mean_doy = df_sorted.apply(calculate_mean_doy, axis=1)
df_sorted['mean'] = mean_doy

df_emerged = df_sorted.iloc[:, [0, -1]]
df_emerged.head()

Unnamed: 0,State,mean
0,Colorado,152
1,Illinois,145
2,Indiana,145
3,Iowa,145
4,Kansas,141


In [72]:
# Folder whose .txt files are read for the 'Silking' stage of the selected year.
folder_path = f'/content/drive/MyDrive/Lab/CPR/{year}/filtered/Silking/'
def process_file(file_path):
    """Parse one report text file into a three-column DataFrame.

    The column names come from the line two below the first line containing
    "Week ending", split on ':'. Data rows are lines shaped like
    "<name>: v v v v", where each value is digits, '-' or '(NA)'.

    Returns the name column plus the 2nd and 3rd value columns; '(NA)' and
    '-' become None, and the two kept value headers get a trailing '_'.

    Raises StopIteration when no line contains "Week ending".
    """
    with open(file_path, 'r') as handle:
        text = handle.read()
    lines = text.strip().split('\n')

    # Position of the first "Week ending" line; the header names sit two lines below it.
    week_row = next(idx for idx, text_line in enumerate(lines) if "Week ending" in text_line)
    headers = []
    for cell in re.split(r'\s*:\s*', lines[week_row + 2]):
        cell = cell.strip()
        if cell:
            headers.append(cell)

    pattern = re.compile(r'([A-Za-z\s.]+):\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))')
    hits = (pattern.match(text_line) for text_line in lines)
    # NOTE(review): strip().rstrip('.') removes dot leaders, but a space that
    # preceded the dots would stay at the end of the name.
    data = [[hit.group(1).strip().rstrip('.'), *hit.groups()[1:]] for hit in hits if hit]

    table = pd.DataFrame(data, columns=headers)
    table = table.replace({'(NA)': None, '-': None})
    table = table.iloc[:, [0, 2, 3]]
    name_col, first_week, second_week = table.columns
    table.columns = [name_col, f"{first_week}_", f"{second_week}_"]
    return table

# Gather the .txt reports in the stage folder and parse each one.
file_paths = []
for entry in os.listdir(folder_path):
    if entry.endswith('.txt'):
        file_paths.append(os.path.join(folder_path, entry))
dataframes = list(map(process_file, file_paths))

# Outer-join the frames on 'State'; a column already present in the running
# result is dropped from the incoming frame so it is not duplicated.
merged_df = dataframes[0]
for df in dataframes[1:]:
    repeated = [col for col in df.columns if col != 'State' and col in merged_df.columns]
    merged_df = merged_df.merge(df.drop(columns=repeated), on='State', how='outer')

def extract_date(col):
    """Parse the "<Month> <day>" prefix of a column label into a sortable datetime.

    Only the text before the ',_' separator is parsed (e.g. "July 16,_").

    Returns:
        A datetime in the fixed leap year 2000; only its month/day ordering
        is meaningful.

    Raises:
        ValueError: if the prefix is not a full month name followed by a day.
    """
    month_day = col.split(',_')[0]
    # Parsing '%B %d' without a year defaults to 1900, which rejects
    # "February 29" and is deprecated since Python 3.13; pin a leap year instead.
    return datetime.strptime(f"2000 {month_day}", '%Y %B %d')

# Put the non-'State' columns in the order given by extract_date, with 'State' first.
state_col = merged_df['State']
week_cols = sorted(merged_df.columns.difference(['State']), key=extract_date)
weekly_values = merged_df.drop(columns='State')[week_cols]
df_sorted = pd.concat([state_col, weekly_values], axis=1)

def extract_doy(col):
    """Return the day of year (1-based) of a column label within the module-level `year`.

    Only the text before the ',_' separator is parsed, as "<Month> <day>".
    """
    month_day = col.partition(',_')[0]
    stamp = datetime.strptime(f"{year} {month_day}", '%Y %B %d')
    return int(stamp.strftime('%j'))

# Relabel every column after the first with its day of year (as text); the first becomes 'State'.
doy_labels = ['State']
for col in df_sorted.columns[1:]:
    doy_labels.append(f"{extract_doy(col)}{col.split(',_')[1]} ".strip())
df_sorted.columns = doy_labels

# Cast the values to floats; entries that cannot be parsed become NaN.
value_columns = df_sorted.columns[1:]
for col in value_columns:
    df_sorted[col] = pd.to_numeric(df_sorted[col], errors='coerce', downcast='float')

def calculate_mean_doy(row):
    """Return the rounded-up mean of the day-of-year labels that count for a row.

    A label counts when it is an all-digit string and its value in `row` is
    neither missing nor equal to 100. Non-numeric labels such as 'State' or
    an already-added 'mean' are ignored, so the function depends only on the
    row it is given and can safely be re-applied.

    Args:
        row: a pandas Series labelled by 'State' plus day-of-year strings.

    Returns:
        int: ceiling of the mean of the counted labels, or None if none count.
    """
    doys = [
        int(label)
        for label, value in row.items()
        if str(label).isdigit() and not pd.isna(value) and value != 100
    ]
    return int(np.ceil(np.mean(doys))) if doys else None

# Add the per-row mean day of year, then keep the first and last columns
# ('State' and the 'mean' just added) as the Silking result.
mean_doy = df_sorted.apply(calculate_mean_doy, axis=1)
df_sorted['mean'] = mean_doy

df_silking = df_sorted.iloc[:, [0, -1]]
df_silking.head()

Unnamed: 0,State,mean
0,Colorado,211
1,Illinois,201
2,Indiana,197
3,Iowa,201
4,Kansas,201


In [73]:
# Folder whose .txt files are read for the 'Dough' stage of the selected year.
folder_path = f'/content/drive/MyDrive/Lab/CPR/{year}/filtered/Dough/'
def process_file(file_path):
    """Parse one report text file into a three-column DataFrame.

    The column names come from the line two below the first line containing
    "Week ending", split on ':'. Data rows are lines shaped like
    "<name>: v v v v", where each value is digits, '-' or '(NA)'.

    Returns the name column plus the 2nd and 3rd value columns; '(NA)' and
    '-' become None, and the two kept value headers get a trailing '_'.

    Raises StopIteration when no line contains "Week ending".
    """
    with open(file_path, 'r') as handle:
        text = handle.read()
    lines = text.strip().split('\n')

    # Position of the first "Week ending" line; the header names sit two lines below it.
    week_row = next(idx for idx, text_line in enumerate(lines) if "Week ending" in text_line)
    headers = []
    for cell in re.split(r'\s*:\s*', lines[week_row + 2]):
        cell = cell.strip()
        if cell:
            headers.append(cell)

    pattern = re.compile(r'([A-Za-z\s.]+):\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))')
    hits = (pattern.match(text_line) for text_line in lines)
    # NOTE(review): strip().rstrip('.') removes dot leaders, but a space that
    # preceded the dots would stay at the end of the name.
    data = [[hit.group(1).strip().rstrip('.'), *hit.groups()[1:]] for hit in hits if hit]

    table = pd.DataFrame(data, columns=headers)
    table = table.replace({'(NA)': None, '-': None})
    table = table.iloc[:, [0, 2, 3]]
    name_col, first_week, second_week = table.columns
    table.columns = [name_col, f"{first_week}_", f"{second_week}_"]
    return table

# Gather the .txt reports in the stage folder and parse each one.
file_paths = []
for entry in os.listdir(folder_path):
    if entry.endswith('.txt'):
        file_paths.append(os.path.join(folder_path, entry))
dataframes = list(map(process_file, file_paths))

# Outer-join the frames on 'State'; a column already present in the running
# result is dropped from the incoming frame so it is not duplicated.
merged_df = dataframes[0]
for df in dataframes[1:]:
    repeated = [col for col in df.columns if col != 'State' and col in merged_df.columns]
    merged_df = merged_df.merge(df.drop(columns=repeated), on='State', how='outer')

def extract_date(col):
    """Parse the "<Month> <day>" prefix of a column label into a sortable datetime.

    Only the text before the ',_' separator is parsed (e.g. "August 6,_").

    Returns:
        A datetime in the fixed leap year 2000; only its month/day ordering
        is meaningful.

    Raises:
        ValueError: if the prefix is not a full month name followed by a day.
    """
    month_day = col.split(',_')[0]
    # Parsing '%B %d' without a year defaults to 1900, which rejects
    # "February 29" and is deprecated since Python 3.13; pin a leap year instead.
    return datetime.strptime(f"2000 {month_day}", '%Y %B %d')

# Put the non-'State' columns in the order given by extract_date, with 'State' first.
state_col = merged_df['State']
week_cols = sorted(merged_df.columns.difference(['State']), key=extract_date)
weekly_values = merged_df.drop(columns='State')[week_cols]
df_sorted = pd.concat([state_col, weekly_values], axis=1)

def extract_doy(col):
    """Return the day of year (1-based) of a column label within the module-level `year`.

    Only the text before the ',_' separator is parsed, as "<Month> <day>".
    """
    month_day = col.partition(',_')[0]
    stamp = datetime.strptime(f"{year} {month_day}", '%Y %B %d')
    return int(stamp.strftime('%j'))

# Relabel every column after the first with its day of year (as text); the first becomes 'State'.
doy_labels = ['State']
for col in df_sorted.columns[1:]:
    doy_labels.append(f"{extract_doy(col)}{col.split(',_')[1]} ".strip())
df_sorted.columns = doy_labels

# Cast the values to floats; entries that cannot be parsed become NaN.
value_columns = df_sorted.columns[1:]
for col in value_columns:
    df_sorted[col] = pd.to_numeric(df_sorted[col], errors='coerce', downcast='float')

def calculate_mean_doy(row):
    """Return the rounded-up mean of the day-of-year labels that count for a row.

    A label counts when it is an all-digit string and its value in `row` is
    neither missing nor equal to 100. Non-numeric labels such as 'State' or
    an already-added 'mean' are ignored, so the function depends only on the
    row it is given and can safely be re-applied.

    Args:
        row: a pandas Series labelled by 'State' plus day-of-year strings.

    Returns:
        int: ceiling of the mean of the counted labels, or None if none count.
    """
    doys = [
        int(label)
        for label, value in row.items()
        if str(label).isdigit() and not pd.isna(value) and value != 100
    ]
    return int(np.ceil(np.mean(doys))) if doys else None

# Add the per-row mean day of year, then keep the first and last columns
# ('State' and the 'mean' just added) as the Dough result.
mean_doy = df_sorted.apply(calculate_mean_doy, axis=1)
df_sorted['mean'] = mean_doy

df_dough = df_sorted.iloc[:, [0, -1]]
df_dough.head()

Unnamed: 0,State,mean
0,Colorado,232
1,Illinois,222
2,Indiana,225
3,Iowa,222
4,Kansas,222


In [74]:
# Folder whose .txt files are read for the 'Dented' stage of the selected year.
folder_path = f'/content/drive/MyDrive/Lab/CPR/{year}/filtered/Dented/'
def process_file(file_path):
    """Parse one report text file into a three-column DataFrame.

    The column names come from the line two below the first line containing
    "Week ending", split on ':'. Data rows are lines shaped like
    "<name>: v v v v", where each value is digits, '-' or '(NA)'.

    Returns the name column plus the 2nd and 3rd value columns; '(NA)' and
    '-' become None, and the two kept value headers get a trailing '_'.

    Raises StopIteration when no line contains "Week ending".
    """
    with open(file_path, 'r') as handle:
        text = handle.read()
    lines = text.strip().split('\n')

    # Position of the first "Week ending" line; the header names sit two lines below it.
    week_row = next(idx for idx, text_line in enumerate(lines) if "Week ending" in text_line)
    headers = []
    for cell in re.split(r'\s*:\s*', lines[week_row + 2]):
        cell = cell.strip()
        if cell:
            headers.append(cell)

    pattern = re.compile(r'([A-Za-z\s.]+):\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))')
    hits = (pattern.match(text_line) for text_line in lines)
    # NOTE(review): strip().rstrip('.') removes dot leaders, but a space that
    # preceded the dots would stay at the end of the name.
    data = [[hit.group(1).strip().rstrip('.'), *hit.groups()[1:]] for hit in hits if hit]

    table = pd.DataFrame(data, columns=headers)
    table = table.replace({'(NA)': None, '-': None})
    table = table.iloc[:, [0, 2, 3]]
    name_col, first_week, second_week = table.columns
    table.columns = [name_col, f"{first_week}_", f"{second_week}_"]
    return table

# Gather the .txt reports in the stage folder and parse each one.
file_paths = []
for entry in os.listdir(folder_path):
    if entry.endswith('.txt'):
        file_paths.append(os.path.join(folder_path, entry))
dataframes = list(map(process_file, file_paths))

# Outer-join the frames on 'State'; a column already present in the running
# result is dropped from the incoming frame so it is not duplicated.
merged_df = dataframes[0]
for df in dataframes[1:]:
    repeated = [col for col in df.columns if col != 'State' and col in merged_df.columns]
    merged_df = merged_df.merge(df.drop(columns=repeated), on='State', how='outer')

def extract_date(col):
    """Parse the "<Month> <day>" prefix of a column label into a sortable datetime.

    Only the text before the ',_' separator is parsed (e.g. "August 27,_").

    Returns:
        A datetime in the fixed leap year 2000; only its month/day ordering
        is meaningful.

    Raises:
        ValueError: if the prefix is not a full month name followed by a day.
    """
    month_day = col.split(',_')[0]
    # Parsing '%B %d' without a year defaults to 1900, which rejects
    # "February 29" and is deprecated since Python 3.13; pin a leap year instead.
    return datetime.strptime(f"2000 {month_day}", '%Y %B %d')

# Put the non-'State' columns in the order given by extract_date, with 'State' first.
state_col = merged_df['State']
week_cols = sorted(merged_df.columns.difference(['State']), key=extract_date)
weekly_values = merged_df.drop(columns='State')[week_cols]
df_sorted = pd.concat([state_col, weekly_values], axis=1)

def extract_doy(col):
    """Return the day of year (1-based) of a column label within the module-level `year`.

    Only the text before the ',_' separator is parsed, as "<Month> <day>".
    """
    month_day = col.partition(',_')[0]
    stamp = datetime.strptime(f"{year} {month_day}", '%Y %B %d')
    return int(stamp.strftime('%j'))

# Relabel every column after the first with its day of year (as text); the first becomes 'State'.
doy_labels = ['State']
for col in df_sorted.columns[1:]:
    doy_labels.append(f"{extract_doy(col)}{col.split(',_')[1]} ".strip())
df_sorted.columns = doy_labels

# Cast the values to floats; entries that cannot be parsed become NaN.
value_columns = df_sorted.columns[1:]
for col in value_columns:
    df_sorted[col] = pd.to_numeric(df_sorted[col], errors='coerce', downcast='float')

def calculate_mean_doy(row):
    """Return the rounded-up mean of the day-of-year labels that count for a row.

    A label counts when it is an all-digit string and its value in `row` is
    neither missing nor equal to 100. Non-numeric labels such as 'State' or
    an already-added 'mean' are ignored, so the function depends only on the
    row it is given and can safely be re-applied.

    Args:
        row: a pandas Series labelled by 'State' plus day-of-year strings.

    Returns:
        int: ceiling of the mean of the counted labels, or None if none count.
    """
    doys = [
        int(label)
        for label, value in row.items()
        if str(label).isdigit() and not pd.isna(value) and value != 100
    ]
    return int(np.ceil(np.mean(doys))) if doys else None

# Add the per-row mean day of year, then keep the first and last columns
# ('State' and the 'mean' just added) as the Dented result.
mean_doy = df_sorted.apply(calculate_mean_doy, axis=1)
df_sorted['mean'] = mean_doy

df_dented = df_sorted.iloc[:, [0, -1]]
df_dented.head()

Unnamed: 0,State,mean
0,Colorado,243
1,Illinois,243
2,Indiana,246
3,Iowa,239
4,Kansas,243


In [75]:
# Folder whose .txt files are read for the 'Mature' stage of the selected year.
folder_path = f'/content/drive/MyDrive/Lab/CPR/{year}/filtered/Mature/'
def process_file(file_path):
    """Parse one report text file into a three-column DataFrame.

    The column names come from the line two below the first line containing
    "Week ending", split on ':'. Data rows are lines shaped like
    "<name>: v v v v", where each value is digits, '-' or '(NA)'.

    Returns the name column plus the 2nd and 3rd value columns; '(NA)' and
    '-' become None, and the two kept value headers get a trailing '_'.

    Raises StopIteration when no line contains "Week ending".
    """
    with open(file_path, 'r') as handle:
        text = handle.read()
    lines = text.strip().split('\n')

    # Position of the first "Week ending" line; the header names sit two lines below it.
    week_row = next(idx for idx, text_line in enumerate(lines) if "Week ending" in text_line)
    headers = []
    for cell in re.split(r'\s*:\s*', lines[week_row + 2]):
        cell = cell.strip()
        if cell:
            headers.append(cell)

    pattern = re.compile(r'([A-Za-z\s.]+):\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))')
    hits = (pattern.match(text_line) for text_line in lines)
    # NOTE(review): strip().rstrip('.') removes dot leaders, but a space that
    # preceded the dots would stay at the end of the name.
    data = [[hit.group(1).strip().rstrip('.'), *hit.groups()[1:]] for hit in hits if hit]

    table = pd.DataFrame(data, columns=headers)
    table = table.replace({'(NA)': None, '-': None})
    table = table.iloc[:, [0, 2, 3]]
    name_col, first_week, second_week = table.columns
    table.columns = [name_col, f"{first_week}_", f"{second_week}_"]
    return table

# Gather the .txt reports in the stage folder and parse each one.
file_paths = []
for entry in os.listdir(folder_path):
    if entry.endswith('.txt'):
        file_paths.append(os.path.join(folder_path, entry))
dataframes = list(map(process_file, file_paths))

# Outer-join the frames on 'State'; a column already present in the running
# result is dropped from the incoming frame so it is not duplicated.
merged_df = dataframes[0]
for df in dataframes[1:]:
    repeated = [col for col in df.columns if col != 'State' and col in merged_df.columns]
    merged_df = merged_df.merge(df.drop(columns=repeated), on='State', how='outer')

def extract_date(col):
    """Parse the "<Month> <day>" prefix of a column label into a sortable datetime.

    Only the text before the ',_' separator is parsed (e.g. "October 1,_").

    Returns:
        A datetime in the fixed leap year 2000; only its month/day ordering
        is meaningful.

    Raises:
        ValueError: if the prefix is not a full month name followed by a day.
    """
    month_day = col.split(',_')[0]
    # Parsing '%B %d' without a year defaults to 1900, which rejects
    # "February 29" and is deprecated since Python 3.13; pin a leap year instead.
    return datetime.strptime(f"2000 {month_day}", '%Y %B %d')

# Put the non-'State' columns in the order given by extract_date, with 'State' first.
state_col = merged_df['State']
week_cols = sorted(merged_df.columns.difference(['State']), key=extract_date)
weekly_values = merged_df.drop(columns='State')[week_cols]
df_sorted = pd.concat([state_col, weekly_values], axis=1)

def extract_doy(col):
    """Return the day of year (1-based) of a column label within the module-level `year`.

    Only the text before the ',_' separator is parsed, as "<Month> <day>".
    """
    month_day = col.partition(',_')[0]
    stamp = datetime.strptime(f"{year} {month_day}", '%Y %B %d')
    return int(stamp.strftime('%j'))

# Relabel every column after the first with its day of year (as text); the first becomes 'State'.
doy_labels = ['State']
for col in df_sorted.columns[1:]:
    doy_labels.append(f"{extract_doy(col)}{col.split(',_')[1]} ".strip())
df_sorted.columns = doy_labels

# Cast the values to floats; entries that cannot be parsed become NaN.
value_columns = df_sorted.columns[1:]
for col in value_columns:
    df_sorted[col] = pd.to_numeric(df_sorted[col], errors='coerce', downcast='float')

def calculate_mean_doy(row):
    """Return the rounded-up mean of the day-of-year labels that count for a row.

    A label counts when it is an all-digit string and its value in `row` is
    neither missing nor equal to 100. Non-numeric labels such as 'State' or
    an already-added 'mean' are ignored, so the function depends only on the
    row it is given and can safely be re-applied.

    Args:
        row: a pandas Series labelled by 'State' plus day-of-year strings.

    Returns:
        int: ceiling of the mean of the counted labels, or None if none count.
    """
    doys = [
        int(label)
        for label, value in row.items()
        if str(label).isdigit() and not pd.isna(value) and value != 100
    ]
    return int(np.ceil(np.mean(doys))) if doys else None

# Add the per-row mean day of year, then keep the first and last columns
# ('State' and the 'mean' just added) as the Mature result.
mean_doy = df_sorted.apply(calculate_mean_doy, axis=1)
df_sorted['mean'] = mean_doy

df_mature = df_sorted.iloc[:, [0, -1]]
df_mature.head()

Unnamed: 0,State,mean
0,Colorado,260
1,Illinois,260
2,Indiana,264
3,Iowa,260
4,Kansas,257


In [76]:
# Folder whose .txt files are read for the 'Harvested' stage of the selected year.
folder_path = f'/content/drive/MyDrive/Lab/CPR/{year}/filtered/Harvested/'
def process_file(file_path):
    """Parse one report text file into a three-column DataFrame.

    The column names come from the line two below the first line containing
    "Week ending", split on ':'. Data rows are lines shaped like
    "<name>: v v v v", where each value is digits, '-' or '(NA)'.

    Returns the name column plus the 2nd and 3rd value columns; '(NA)' and
    '-' become None, and the two kept value headers get a trailing '_'.

    Raises StopIteration when no line contains "Week ending".
    """
    with open(file_path, 'r') as handle:
        text = handle.read()
    lines = text.strip().split('\n')

    # Position of the first "Week ending" line; the header names sit two lines below it.
    week_row = next(idx for idx, text_line in enumerate(lines) if "Week ending" in text_line)
    headers = []
    for cell in re.split(r'\s*:\s*', lines[week_row + 2]):
        cell = cell.strip()
        if cell:
            headers.append(cell)

    pattern = re.compile(r'([A-Za-z\s.]+):\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))\s+(\d+|\-|\(NA\))')
    hits = (pattern.match(text_line) for text_line in lines)
    # NOTE(review): strip().rstrip('.') removes dot leaders, but a space that
    # preceded the dots would stay at the end of the name.
    data = [[hit.group(1).strip().rstrip('.'), *hit.groups()[1:]] for hit in hits if hit]

    table = pd.DataFrame(data, columns=headers)
    table = table.replace({'(NA)': None, '-': None})
    table = table.iloc[:, [0, 2, 3]]
    name_col, first_week, second_week = table.columns
    table.columns = [name_col, f"{first_week}_", f"{second_week}_"]
    return table

# Gather the .txt reports in the stage folder and parse each one.
file_paths = []
for entry in os.listdir(folder_path):
    if entry.endswith('.txt'):
        file_paths.append(os.path.join(folder_path, entry))
dataframes = list(map(process_file, file_paths))

# Outer-join the frames on 'State'; a column already present in the running
# result is dropped from the incoming frame so it is not duplicated.
merged_df = dataframes[0]
for df in dataframes[1:]:
    repeated = [col for col in df.columns if col != 'State' and col in merged_df.columns]
    merged_df = merged_df.merge(df.drop(columns=repeated), on='State', how='outer')

def extract_date(col):
    """Parse the "<Month> <day>" prefix of a column label into a sortable datetime.

    Only the text before the ',_' separator is parsed (e.g. "October 22,_").

    Returns:
        A datetime in the fixed leap year 2000; only its month/day ordering
        is meaningful.

    Raises:
        ValueError: if the prefix is not a full month name followed by a day.
    """
    month_day = col.split(',_')[0]
    # Parsing '%B %d' without a year defaults to 1900, which rejects
    # "February 29" and is deprecated since Python 3.13; pin a leap year instead.
    return datetime.strptime(f"2000 {month_day}", '%Y %B %d')

# Put the non-'State' columns in the order given by extract_date, with 'State' first.
state_col = merged_df['State']
week_cols = sorted(merged_df.columns.difference(['State']), key=extract_date)
weekly_values = merged_df.drop(columns='State')[week_cols]
df_sorted = pd.concat([state_col, weekly_values], axis=1)

def extract_doy(col):
    """Return the day of year (1-based) of a column label within the module-level `year`.

    Only the text before the ',_' separator is parsed, as "<Month> <day>".
    """
    month_day = col.partition(',_')[0]
    stamp = datetime.strptime(f"{year} {month_day}", '%Y %B %d')
    return int(stamp.strftime('%j'))

# Relabel every column after the first with its day of year (as text); the first becomes 'State'.
doy_labels = ['State']
for col in df_sorted.columns[1:]:
    doy_labels.append(f"{extract_doy(col)}{col.split(',_')[1]} ".strip())
df_sorted.columns = doy_labels

# Cast the values to floats; entries that cannot be parsed become NaN.
value_columns = df_sorted.columns[1:]
for col in value_columns:
    df_sorted[col] = pd.to_numeric(df_sorted[col], errors='coerce', downcast='float')

def calculate_mean_doy(row):
    """Return the rounded-up mean of the day-of-year labels that count for a row.

    A label counts when it is an all-digit string and its value in `row` is
    neither missing nor equal to 100. Non-numeric labels such as 'State' or
    an already-added 'mean' are ignored, so the function depends only on the
    row it is given and can safely be re-applied.

    Args:
        row: a pandas Series labelled by 'State' plus day-of-year strings.

    Returns:
        int: ceiling of the mean of the counted labels, or None if none count.
    """
    doys = [
        int(label)
        for label, value in row.items()
        if str(label).isdigit() and not pd.isna(value) and value != 100
    ]
    return int(np.ceil(np.mean(doys))) if doys else None

# Add the per-row mean day of year, then keep the first and last columns
# ('State' and the 'mean' just added) as the Harvested result.
mean_doy = df_sorted.apply(calculate_mean_doy, axis=1)
df_sorted['mean'] = mean_doy

df_harvested = df_sorted.iloc[:, [0, -1]]
df_harvested.head()

Unnamed: 0,State,mean
0,Colorado,302
1,Illinois,288
2,Indiana,295
3,Iowa,292
4,Kansas,281


# Merging

In [77]:
# Give each stage frame's 'mean' column the name of its stage.
(
    df_planted,
    df_emerged,
    df_silking,
    df_dough,
    df_dented,
    df_mature,
    df_harvested,
) = (
    frame.rename(columns={'mean': stage})
    for frame, stage in [
        (df_planted, 'Planted'),
        (df_emerged, 'Emerged'),
        (df_silking, 'Silking'),
        (df_dough, 'Dough'),
        (df_dented, 'Dented'),
        (df_mature, 'Mature'),
        (df_harvested, 'Harvested'),
    ]
)

# Join the stage frames one after another on 'State' (default inner join).
df_merge = df_planted
for stage_df in (df_emerged, df_silking, df_dough, df_dented, df_mature, df_harvested):
    df_merge = df_merge.merge(stage_df, on='State')

df_merge


Unnamed: 0,State,Planted,Emerged,Silking,Dough,Dented,Mature,Harvested
0,Colorado,131,152,211,232,243,260,302
1,Illinois,127,145,201,222,243,260,288
2,Indiana,131,145,197,225,246,264,295
3,Iowa,127,145,201,222,239,260,292
4,Kansas,124,141,201,222,243,257,281
5,Kentucky,124,141,201,222,239,260,288
6,Michigan,134,152,208,229,246,267,295
7,Minnesota,134,148,201,222,239,264,292
8,Missouri,127,141,201,222,243,260,288
9,Nebraska,131,145,204,225,243,260,292


In [78]:
# Write the merged stage table to CSV without the index column.
# NOTE(review): the output directory hard-codes "2013" while the input folders
# use `{year}` — confirm this is intentional and not meant to be `{year}`.
df_merge.to_csv(f'/content/drive/MyDrive/Lab/CPR/2013/CPRs/CPR_States_{year}.csv', index=False)