# Convert files
We have a bunch of tsv files that need initial cleaning. At the end, each cleaned file is saved as a pickle (plus a csv copy).


In [1]:
import os

import pandas as pd
import numpy as np

BASE = os.path.join(os.pardir, "data")


In [2]:
# Functions

def list_tsv():
    """Return the sorted names of the tsv files found in ``BASE/tsv``.

    Only directory entries whose name ends in ``.tsv`` (case-insensitive)
    are returned, so stray entries such as hidden files or checkpoint
    folders are not handed to the tsv reader. The names are sorted because
    ``os.listdir`` gives no ordering guarantee.
    """

    tsv_dir = os.path.join(BASE, "tsv")
    return sorted(
        name for name in os.listdir(tsv_dir)
        if name.lower().endswith(".tsv")
    )

def simplify_col(col):
    """Return column as a numeric series.

    Columns that are already numeric are returned unchanged. Otherwise each
    cell is reduced to its first whitespace-separated token (so a value
    followed by a flag, e.g. ``"1.5 b"``, becomes ``"1.5"``) and converted
    with ``pd.to_numeric``. Cells containing ``':'``, empty cells, missing
    cells and anything that cannot be parsed as a number become NaN.

    The result is a Series that keeps the index and name of ``col``.
    """

    # Skip if already numeric
    if pd.api.types.is_numeric_dtype(col):
        return col

    def _first_token(value):
        """Return the leading token of a string cell, or None if missing."""

        # Non-string cells (NaN, None, plain numbers in an object column)
        # are passed through so pd.to_numeric can deal with them.
        if not isinstance(value, str):
            return value

        # ':' marks a missing value
        if ":" in value:
            return None

        # Empty / whitespace-only strings have no token at all
        tokens = value.split()
        return tokens[0] if tokens else None

    # Series.map keeps the original index and name
    cleaned = col.map(_first_token)

    # Return as numeric column; unparsable values are coerced to NaN
    return pd.to_numeric(cleaned, errors="coerce")

def split_first_col(df):
    """Split first column and return as a new df.

    The header of the first column is a comma-separated list of variable
    names; anything after a backslash in the last name is dropped. The
    values of that column are split on ``','`` into one column per variable
    and placed in front of the remaining columns. All column names are
    lower-cased and stripped of surrounding whitespace.

    The input frame is left untouched; a new DataFrame is returned.

    Raises:
        ValueError: if the number of values produced by the split does not
            match the number of names in the header.
    """

    # Derive the new variable names from the header of the first column
    new_names = df.columns[0].split(",")
    new_names[-1] = new_names[-1].split("\\")[0]

    # Split the values into their own frame instead of writing new columns
    # onto ``df``, which would mutate the caller's object.
    parts = df.iloc[:, 0].str.split(",", expand=True)
    parts.columns = new_names

    # New variable columns first, then all the other original columns
    result = pd.concat([parts, df.iloc[:, 1:]], axis=1)

    # Clean the column names
    result.columns = [str(name).lower().strip() for name in result.columns]

    return result

In [3]:
# Main function

def clean_df(filename, replace=True):
    """Take filename, create & clean df and save as pickle.

    The file is read from ``BASE/tsv`` as tab-separated text. Every column
    except the first is passed through ``simplify_col`` and the first column
    is expanded by ``split_first_col``. The result is written to
    ``BASE/pickles`` as ``<name>.pkl`` and ``<name>.csv``, where ``<name>``
    is ``filename`` without its extension.

    Args:
        filename: Name of a file inside ``BASE/tsv``.
        replace: If False, the file is skipped when its pickle already
            exists.

    Returns:
        ``filename`` when the file was processed, or ``"<filename> skipped"``
        when it was skipped.
    """

    # splitext only removes the final extension, so names that contain
    # additional dots keep their full stem and cannot collide.
    newname = os.path.splitext(filename)[0]
    out_dir = os.path.join(BASE, "pickles")
    pickle_path = os.path.join(out_dir, f"{newname}.pkl")

    # Skip file if it exists and the flag is set to false
    if not replace and os.path.exists(pickle_path):
        return f"{filename} skipped"

    # Open file
    df = pd.read_csv(os.path.join(BASE, "tsv", filename), sep="\t")

    # Set ':' values to NaN and rest to numeric
    for col in df.columns[1:]:
        df[col] = simplify_col(df[col])

    # Convert variables in first column to separate columns
    df = split_first_col(df)

    # Save files; make sure the target folder exists on a first run
    os.makedirs(out_dir, exist_ok=True)
    df.to_pickle(pickle_path)
    df.to_csv(os.path.join(out_dir, f"{newname}.csv"))

    return filename

In [4]:
# Process every tsv file (keeping pickles that already exist) and report
# the outcome for each one

files = list_tsv()

for filename in files:
    outcome = clean_df(filename, replace=False)
    print(outcome)

ilc_di12.tsv
lfsq_egan.tsv skipped
sdg_08_40.tsv skipped
t2020_10.tsv skipped
tec00114.tsv skipped
teilm010.tsv skipped
teilm011.tsv skipped
teilm012.tsv skipped
teilm020.tsv skipped
teilm021.tsv skipped
teilm022.tsv skipped
tepsr_wc120.tsv skipped
tgs00007.tsv skipped
tgs00010.tsv skipped
tgs00053.tsv skipped
tgs00102.tsv skipped
tps00066.tsv skipped
tps00070.tsv skipped
tps00071.tsv skipped
tps00073.tsv skipped
tps00074.tsv skipped
tps00159.tsv skipped
tps00181.tsv skipped
tps00182.tsv skipped
tps00203.tsv skipped
