/
csv.py
149 lines (132 loc) · 4.63 KB
/
csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# author: Scott Gigante <scott.gigante@yale.edu>
# (C) 2018 Krishnaswamy Lab GPLv2
import pandas as pd
from .utils import _matrix_to_data_frame
from .. import utils
def _read_csv_sparse(filename, chunksize=1000000, fill_value=0.0, **kwargs):
    """Read a csv file chunk by chunk into a pd.DataFrame[pd.SparseArray]

    Parameters
    ----------
    filename : str
        The name of the csv file to be loaded
    chunksize : int, optional (default: 1000000)
        Number of rows requested from `pd.read_csv` per chunk
    fill_value : float, optional (default: 0.0)
        Fill value handed to `utils.dataframe_to_sparse` for every chunk
    **kwargs : optional arguments for `pd.read_csv`.

    Returns
    -------
    data : pd.DataFrame
        Concatenation of the converted chunks, in file order
    """
    reader = pd.read_csv(filename, chunksize=chunksize, **kwargs)
    # Convert each chunk as it is read so the file is never held in memory
    # as one dense frame.
    sparse_pieces = []
    for piece in reader:
        sparse_pieces.append(utils.dataframe_to_sparse(piece, fill_value=fill_value))
    return pd.concat(sparse_pieces)
def load_csv(
    filename,
    cell_axis="row",
    delimiter=",",
    gene_names=True,
    cell_names=True,
    sparse=False,
    **kwargs
):
    """Load a csv file

    Parameters
    ----------
    filename : str
        The name of the csv file to be loaded
    cell_axis : {'row', 'column'}, optional (default: 'row')
        If your data has genes on the rows and cells on the columns, use
        cell_axis='column'. 'col' is accepted as an alias of 'column'.
    delimiter : str, optional (default: ',')
        Use '\\t' for tab separated values (tsv)
    gene_names : `bool`, `str`, array-like, or `None` (default: True)
        If `True`, gene names are read from the file: from the first row when
        cell_axis='row', or from the first column when cell_axis='column'.
        Otherwise expects a filename or an array containing a list of gene
        symbols or ids
    cell_names : `bool`, `str`, array-like, or `None` (default: True)
        If `True`, cell names are read from the file: from the first column
        when cell_axis='row', or from the first row when cell_axis='column'.
        Otherwise expects a filename or an array containing a list of cell
        barcodes.
    sparse : bool, optional (default: False)
        If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses
        less memory but more CPU.
    **kwargs : optional arguments for `pd.read_csv`.
        Passing `index_col` or `header` overrides how the first column or
        first row of the file is read; the names argument describing that
        axis of the file is then ignored.

    Returns
    -------
    data : array-like, shape=[n_samples, n_features]
        If either gene or cell names are given, data will be a pd.DataFrame or
        pd.DataFrame[pd.SparseArray]. If no names are given, data will be a
        np.ndarray or scipy.sparse.spmatrix

    Raises
    ------
    ValueError
        If `cell_axis` is not one of 'row', 'column' or 'col'.
    """
    if cell_axis not in ("row", "column", "col"):
        raise ValueError(
            "cell_axis {} not recognized. Expected 'row' or 'column'".format(cell_axis)
        )
    cells_on_rows = cell_axis == "row"

    # The file is parsed *before* any transpose, so `index_col` / `header`
    # must describe the file as stored on disk. With cells on the rows the
    # first column holds cell names and the first row holds gene names; with
    # cells on the columns the roles are swapped.
    if cells_on_rows:
        index_names, header_names = cell_names, gene_names
    else:
        index_names, header_names = gene_names, cell_names

    if "index_col" in kwargs:
        # An explicit pandas argument takes precedence over the names argument.
        index_col = kwargs.pop("index_col")
        index_names = None
    elif index_names is True:
        index_col = 0
        index_names = None
    else:
        index_col = None

    if "header" in kwargs:
        # An explicit pandas argument takes precedence over the names argument.
        header = kwargs.pop("header")
        header_names = None
    elif header_names is True:
        header = 0
        header_names = None
    else:
        header = None

    # Map the (possibly cleared) names back onto genes and cells.
    if cells_on_rows:
        cell_names, gene_names = index_names, header_names
    else:
        gene_names, cell_names = index_names, header_names

    # Read in csv file
    read_fun = _read_csv_sparse if sparse else pd.read_csv
    data = read_fun(
        filename, delimiter=delimiter, header=header, index_col=index_col, **kwargs
    )

    if not cells_on_rows:
        # Downstream code expects cells on the rows.
        data = data.T

    return _matrix_to_data_frame(
        data, gene_names=gene_names, cell_names=cell_names, sparse=sparse
    )
def load_tsv(
    filename,
    cell_axis="row",
    delimiter="\t",
    gene_names=True,
    cell_names=True,
    sparse=False,
    **kwargs
):
    """Load a tsv file

    Thin wrapper around `load_csv` whose only difference is the default
    delimiter.

    Parameters
    ----------
    filename : str
        The name of the tsv file to be loaded
    cell_axis : {'row', 'column'}, optional (default: 'row')
        If your data has genes on the rows and cells on the columns, use
        cell_axis='column'
    delimiter : str, optional (default: '\\t')
        Use ',' for comma separated values (csv)
    gene_names : `bool`, `str`, array-like, or `None` (default: True)
        If `True`, we assume gene names are in the first row/column. Otherwise
        expects a filename or an array containing a list of gene symbols or ids
    cell_names : `bool`, `str`, array-like, or `None` (default: True)
        If `True`, we assume cell names are in the first row/column. Otherwise
        expects a filename or an array containing a list of cell barcodes.
    sparse : bool, optional (default: False)
        If True, loads the data as a pd.DataFrame[pd.SparseArray]. This uses
        less memory but more CPU.
    **kwargs : optional arguments for `pd.read_csv`.

    Returns
    -------
    data : array-like, shape=[n_samples, n_features]
        If either gene or cell names are given, data will be a pd.DataFrame or
        pd.DataFrame[pd.SparseArray]. If no names are given, data will be a
        np.ndarray or scipy.sparse.spmatrix
    """
    # The named options cannot also appear in **kwargs (Python binds them to
    # the parameters above), so merging them into one mapping is unambiguous.
    forwarded = dict(kwargs)
    forwarded.update(
        cell_axis=cell_axis,
        delimiter=delimiter,
        gene_names=gene_names,
        cell_names=cell_names,
        sparse=sparse,
    )
    return load_csv(filename, **forwarded)