# Convert files
We have a number of tsv files that need initial cleaning. Once a file has been cleaned, the result is saved as a pickle.


In [1]:
import os

import pandas as pd
import numpy as np

In [18]:
# Relative path to the data directory, one level above the current working
# directory; the code below reads from its "tsv" subfolder and writes to its
# "pickles" subfolder.
BASE = os.path.join(os.pardir, "data")

def list_tsv():
    """Return the sorted names of the tsv files in ``BASE/tsv``.

    Only entries whose name ends in ``.tsv`` (case-insensitive) are returned,
    so stray entries in the folder are not handed to the cleaning step.
    The result is sorted so files are always processed in the same order.

    Returns:
        list[str]: bare file names (no directory part).

    Raises:
        FileNotFoundError: if the ``tsv`` directory does not exist.
    """
    tsv_dir = os.path.join(BASE, "tsv")
    return sorted(
        name for name in os.listdir(tsv_dir) if name.lower().endswith(".tsv")
    )

def simplify_col(col):
    """Return column as a float Series, keeping the original index.

    String values are reduced to their first whitespace-separated token
    (so ``"2.3 p"`` becomes ``2.3``) and converted to numbers.  A value
    becomes NaN when it contains ``":"``, is missing, is empty, or cannot be
    parsed as a number.

    Args:
        col (pd.Series): one data column, either already numeric or holding
            strings.

    Returns:
        pd.Series: values as ``float64`` with the same index as ``col``.
    """
    # Already numeric: nothing to parse, only normalise the dtype.
    if pd.api.types.is_numeric_dtype(col):
        return col.astype("float64")

    # Work on a string view so mixed or missing entries cannot break the
    # ``.str`` operations below; real missing values are masked explicitly.
    text = col.astype(str)
    missing = col.isna() | text.str.contains(":", regex=False, na=False)

    # First token only; an empty string has no tokens and yields NaN here.
    first_token = text.str.split().str[0]

    # ``to_numeric`` may infer an integer dtype, so force float at the end.
    numeric = pd.to_numeric(first_token.mask(missing), errors="coerce")
    return numeric.astype("float64")

def split_first_col(df):
    """Split the comma-packed first column into separate columns.

    The header of the first column lists several variable names separated by
    commas; anything after a backslash in the last name is dropped.  Each
    value in that column is split on commas in the same way and the pieces
    become new leading columns.  All column names in the result are
    lower-cased and stripped of surrounding whitespace.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        df (pd.DataFrame): frame whose first column holds the packed values.

    Returns:
        pd.DataFrame: the split columns followed by the remaining columns.

    Raises:
        ValueError: if the values split into a different number of parts
            than there are names in the header.
    """
    # Derive the new column names from the header of the first column
    new_names = df.columns[0].split(",")
    new_names[-1] = new_names[-1].split("\\")[0]

    # Split the values into their own frame instead of writing into ``df``,
    # so the caller's DataFrame keeps its original columns.
    parts = df.iloc[:, 0].str.split(",", expand=True)
    parts.columns = new_names

    # Put the new variables in front of the untouched data columns
    result = pd.concat([parts, df.iloc[:, 1:]], axis=1)

    # Clean the column names
    result.columns = [name.lower().strip() for name in result.columns]

    return result

In [3]:
def clean_df(filename):
    """Read one tsv file, clean it and save the result as a pickle.

    The file is read from ``BASE/tsv``.  Every column except the first is
    passed through ``simplify_col``, the first column is expanded with
    ``split_first_col``, and the result is written to
    ``BASE/pickles/<filename>.pkl``.  The ``pickles`` directory is created
    if it does not exist yet.

    Args:
        filename (str): name of a file inside ``BASE/tsv``.

    Returns:
        str: the ``filename`` that was passed in.
    """
    df = pd.read_csv(os.path.join(BASE, "tsv", filename), sep="\t")

    # Set ':' values to missing and the rest to numeric
    for col in df.columns[1:]:
        df[col] = simplify_col(df[col])

    # Convert the variables packed in the first column to separate columns
    df = split_first_col(df)

    # Make sure the output folder exists so the first run does not fail
    out_dir = os.path.join(BASE, "pickles")
    os.makedirs(out_dir, exist_ok=True)

    # The pickle name keeps the full source name, including its extension
    df.to_pickle(os.path.join(out_dir, f"{filename}.pkl"))

    return filename

In [19]:
# Run the cleaning on every file returned by list_tsv() and print each
# filename as soon as clean_df() has finished with it.

files = list_tsv()

for filename in files:
    print(clean_df(filename))

teilm021.tsv
teilm022.tsv
tepsr_wc120.tsv
tps00203.tsv
