# Create the train and test datasets

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the full labelled table from disk; every later cell derives from this frame.
# NOTE(review): the file's first column is read as an ordinary column named
# "Unnamed: 0" (visible in the output below) and is never dropped, so it ends up
# in the saved train/test CSVs. If it is only a saved row index, consider
# `index_col=0` here -- confirm with whatever consumes those files first.
total_grouped = pd.read_csv(filepath_or_buffer="data/total_grouped.csv")
total_grouped

Unnamed: 0,FSC-A,FSC-H,FSC-W,SSC-A,SSC-H,SSC-W,Comp-Alexa Fluor 700-A :: CD4,Comp-BUV737-A :: CD5,Comp-Brilliant Violet 421-A :: CD107a,Comp-Brilliant Violet 605-A :: CD8,Comp-FITC-A :: CD226,Comp-LIVE_DEAD Fixable Blue-A,Comp-PE-A :: TIGIT,Comp-PE-CF594-A :: CD197,Comp-PE-Cy5-A :: CD28,Comp-PE-Cy7-A :: CD45RA,Time :: CD226,class
0,321,316,260,216,285,194,557,734,809,272,378,390,663,510,664,205,0,CD4_tcm
1,335,288,298,138,274,129,486,487,807,258,277,307,236,395,523,626,1,CD4_tcm
2,336,304,283,174,270,165,591,609,806,232,317,389,611,421,500,249,3,CD4_tcm
3,295,276,274,83,263,81,519,557,638,259,250,282,254,492,466,638,4,CD4_tcm
4,307,293,268,161,275,150,575,575,792,289,273,386,189,405,583,561,5,CD4_tcm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6807425,393,379,266,308,297,266,321,269,284,296,298,382,249,359,234,333,1023,debris
6807426,232,273,217,285,265,275,510,363,215,277,273,562,277,216,264,231,1023,debris
6807427,271,263,265,210,264,203,104,612,67,304,278,500,166,263,294,804,1023,debris
6807428,280,271,264,170,263,166,237,225,258,251,254,283,242,184,229,221,1023,debris


### Clean up column names:

In [3]:
# Replace the verbose raw column labels with short names that contain no
# spaces, hyphens or "::" separators. Columns not listed here (e.g. "class")
# keep their original names.
short_names = {
    "FSC-A": "FSC_A",
    "FSC-H": "FSC_H",
    "FSC-W": "FSC_W",
    "SSC-A": "SSC_A",
    "SSC-H": "SSC_H",
    "SSC-W": "SSC_W",
    "Comp-Alexa Fluor 700-A :: CD4": "CD4",
    "Comp-BUV737-A :: CD5": "CD5",
    "Comp-Brilliant Violet 421-A :: CD107a": "CD107a",
    "Comp-Brilliant Violet 605-A :: CD8": "CD8",
    "Comp-FITC-A :: CD226": "CD226",
    "Comp-LIVE_DEAD Fixable Blue-A": "LIVE_DEAD",
    "Comp-PE-A :: TIGIT": "TIGIT",
    "Comp-PE-CF594-A :: CD197": "CD197",
    "Comp-PE-Cy5-A :: CD28": "CD28",
    "Comp-PE-Cy7-A :: CD45RA": "CD45RA",
    "Time :: CD226": "time",
}
total_grouped = total_grouped.rename(columns=short_names)
total_grouped

Unnamed: 0,FSC_A,FSC_H,FSC_W,SSC_A,SSC_H,SSC_W,CD4,CD5,CD107a,CD8,CD226,LIVE_DEAD,TIGIT,CD197,CD28,CD45RA,time,class
0,321,316,260,216,285,194,557,734,809,272,378,390,663,510,664,205,0,CD4_tcm
1,335,288,298,138,274,129,486,487,807,258,277,307,236,395,523,626,1,CD4_tcm
2,336,304,283,174,270,165,591,609,806,232,317,389,611,421,500,249,3,CD4_tcm
3,295,276,274,83,263,81,519,557,638,259,250,282,254,492,466,638,4,CD4_tcm
4,307,293,268,161,275,150,575,575,792,289,273,386,189,405,583,561,5,CD4_tcm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6807425,393,379,266,308,297,266,321,269,284,296,298,382,249,359,234,333,1023,debris
6807426,232,273,217,285,265,275,510,363,215,277,273,562,277,216,264,231,1023,debris
6807427,271,263,265,210,264,203,104,612,67,304,278,500,166,263,294,804,1023,debris
6807428,280,271,264,170,263,166,237,225,258,251,254,283,242,184,229,221,1023,debris


### Remove the CD107a, CD226, TIGIT and CD28 markers, as they are not expected to influence the CD4 and CD8 types used in this model; this simplifies the model. The `time` column is dropped as well.

In [4]:
# Discard the CD107a, CD226, TIGIT and CD28 marker columns together with the
# time column; every other column is kept as-is.
unused_columns = ["CD107a", "CD226", "TIGIT", "CD28", "time"]
total_grouped = total_grouped.drop(columns=unused_columns)
total_grouped

Unnamed: 0,FSC_A,FSC_H,FSC_W,SSC_A,SSC_H,SSC_W,CD4,CD5,CD8,LIVE_DEAD,CD197,CD45RA,class
0,321,316,260,216,285,194,557,734,272,390,510,205,CD4_tcm
1,335,288,298,138,274,129,486,487,258,307,395,626,CD4_tcm
2,336,304,283,174,270,165,591,609,232,389,421,249,CD4_tcm
3,295,276,274,83,263,81,519,557,259,282,492,638,CD4_tcm
4,307,293,268,161,275,150,575,575,289,386,405,561,CD4_tcm
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6807425,393,379,266,308,297,266,321,269,296,382,359,333,debris
6807426,232,273,217,285,265,275,510,363,277,562,216,231,debris
6807427,271,263,265,210,264,203,104,612,304,500,263,804,debris
6807428,280,271,264,170,263,166,237,225,251,283,184,221,debris


In [5]:
# Split the table into train and test frames: 20% of the rows are held out
# for testing, and the fixed random_state makes the split repeatable.
train, test = train_test_split(
    total_grouped,
    test_size=0.2,
    random_state=5,
)

In [6]:
# (rows, columns) of the training split, i.e. the rows not held out by test_size=0.2.
train.shape

(5445944, 13)

In [7]:
# (rows, columns) of the held-out test split.
test.shape

(1361486, 13)

In [8]:
# (rows, columns) of the full frame, for comparison with the train and test shapes.
total_grouped.shape

(6807430, 13)

In [9]:
# Preview the first rows of the test split.
test.head()

Unnamed: 0,FSC_A,FSC_H,FSC_W,SSC_A,SSC_H,SSC_W,CD4,CD5,CD8,LIVE_DEAD,CD197,CD45RA,class
4392951,330,286,296,189,280,173,198,338,482,385,239,774,none_live
5851456,270,264,262,205,267,197,626,242,269,617,212,687,debris
1561533,357,292,313,154,277,143,589,560,260,356,486,788,CD4_th0
554944,321,287,286,114,273,107,579,566,247,305,264,271,CD4_tem
1786391,368,294,320,161,286,144,635,542,233,350,464,885,CD4_th0


In [10]:
# Persist both splits as CSV. index=False leaves the DataFrame index out of the
# files, so only the columns shown above are written.
train.to_csv(path_or_buf="data/train_total.csv", index=False)
test.to_csv(path_or_buf="data/test_total.csv", index=False)