In [1]:
import pandas as pd

## Series
- "Массив" данных
- Дополнительно имеет индекс (метки элементов) и имя — например, `MolecularWeight` (молекулярный вес)

In [123]:
# No index is given, so pandas labels the values with the default integers 0..3.
weights = [46.07, 16.04, 78.11, 58.08]
mw = pd.Series(data=weights)
mw

0    46.07
1    16.04
2    78.11
3    58.08
dtype: float64

In [124]:
# Label lookup: with the default integer index, label 0 is the first element.
mw[0]

46.07

In [125]:
# Position lookup: .iloc[0] selects the first element by position.
mw.iloc[0]

46.07

In [126]:
# Same weights, now labelled by molecule name (dict keys become the index)
# and carrying a Series name.
mw = pd.Series(
    {"Ethanol": 46.07, "Methane": 16.04, "Benzene": 78.11, "Acetone": 58.08},
    name="MolecularWeight",
)
mw

Ethanol    46.07
Methane    16.04
Benzene    78.11
Acetone    58.08
Name: MolecularWeight, dtype: float64

In [127]:
# Positional access still works now that the index holds string labels.
mw.iloc[0]

46.07

In [128]:
# Label-based access by molecule name.
mw["Benzene"]

78.11

In [129]:
## Векторизованные операции

In [130]:
# Arithmetic is applied element-wise; the index and the Series name are kept.
mw**2

Ethanol    2122.4449
Methane     257.2816
Benzene    6101.1721
Acetone    3373.2864
Name: MolecularWeight, dtype: float64

In [131]:
def my_sq(x):
    """Return the square of ``x``."""
    return x**2


# Pass the named function itself to apply; it is called once per element.
# (The following cell shows the equivalent anonymous-function form.)
mw.apply(my_sq)

Ethanol    2122.4449
Methane     257.2816
Benzene    6101.1721
Acetone    3373.2864
Name: MolecularWeight, dtype: float64

In [132]:
# Element-wise square via an anonymous function; same values as mw**2 above.
mw.apply(lambda x: x**2)

Ethanol    2122.4449
Methane     257.2816
Benzene    6101.1721
Acetone    3373.2864
Name: MolecularWeight, dtype: float64

In [133]:
# Basic descriptive statistics of the weights, printed one per line.
for label, stat in (
    ("Mean MW:", mw.mean()),
    ("Min MW:", mw.min()),
    ("Max MW:", mw.max()),
):
    print(label, stat)

Mean MW: 49.575
Min MW: 16.04
Max MW: 78.11


In [134]:
# Sort ascending by value. Using sep="\n" (instead of embedding "\n" in the
# label) avoids the separator space that pushed the first row out of alignment.
print("Sorted:", mw.sort_values(), sep="\n")

Sorted:
 Methane    16.04
Ethanol    46.07
Acetone    58.08
Benzene    78.11
Name: MolecularWeight, dtype: float64


In [135]:
mw.T # A standalone Series is not transposed: .T returns it unchanged

Ethanol    46.07
Methane    16.04
Benzene    78.11
Acetone    58.08
Name: MolecularWeight, dtype: float64

In [136]:
# Show the Series again for comparison with the mw.T output above.
mw

Ethanol    46.07
Methane    16.04
Benzene    78.11
Acetone    58.08
Name: MolecularWeight, dtype: float64

In [137]:
# Underlying array of the "transposed" Series.
mw.T.values

array([46.07, 16.04, 78.11, 58.08])

In [138]:
# Underlying array of the original Series; identical to mw.T.values above.
mw.values

array([46.07, 16.04, 78.11, 58.08])

In [139]:
## Фильтрация

In [140]:
# The comparison is element-wise and yields a boolean Series (a mask).
mw > 20

Ethanol     True
Methane    False
Benzene     True
Acetone     True
Name: MolecularWeight, dtype: bool

In [141]:
# Indexing with a boolean mask keeps only the entries where the mask is True.
mw[mw > 20]

Ethanol    46.07
Benzene    78.11
Acetone    58.08
Name: MolecularWeight, dtype: float64

In [142]:
# Combine two masks with & (element-wise AND); each comparison is parenthesised
# because & binds tighter than the comparison operators.
mw[(20 < mw) & (mw < 50)]

Ethanol    46.07
Name: MolecularWeight, dtype: float64

In [143]:
# Deliberate error demo: Python's `and` asks each Series for a single truth
# value, which is ambiguous; use the element-wise `&` from the previous cell.
mw[(20 < mw) and (mw < 50)] # WILL NOT WORK!

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

## DataFrame создание

In [144]:
import pandas as pd

# Column-oriented input: each key becomes a column, each list holds that column's values.
data = dict(
    Molecule=["Ethanol", "Methane", "Benzene", "Acetone"],
    MolecularWeight=[46.07, 16.04, 78.11, 58.08],
    LogP=[-0.31, 1.09, 2.13, -0.24],
)

df = pd.DataFrame(data=data)
print(df)


  Molecule  MolecularWeight  LogP
0  Ethanol            46.07 -0.31
1  Methane            16.04  1.09
2  Benzene            78.11  2.13
3  Acetone            58.08 -0.24


In [145]:
# Row-oriented input: one dict per molecule, keys become the column names.
records = [
    dict(Molecule="Ethanol", MolecularWeight=46.07, LogP=-0.31),
    dict(Molecule="Methane", MolecularWeight=16.04, LogP=1.09),
    dict(Molecule="Benzene", MolecularWeight=78.11, LogP=2.13),
    dict(Molecule="Acetone", MolecularWeight=58.08, LogP=-0.24),
]

df = pd.DataFrame(data=records)
print(df)


  Molecule  MolecularWeight  LogP
0  Ethanol            46.07 -0.31
1  Methane            16.04  1.09
2  Benzene            78.11  2.13
3  Acetone            58.08 -0.24


In [55]:
# Two Series that share the same labels; pandas aligns them on the index
# when they are combined into a DataFrame.
molecule_labels = ["Ethanol", "Methane", "Benzene", "Acetone"]
mw = pd.Series(data=[46.07, 16.04, 78.11, 58.08], index=molecule_labels)
logp = pd.Series(data=[-0.31, 1.09, 2.13, -0.24], index=molecule_labels)

columns_by_name = {"MolecularWeight": mw, "LogP": logp}
df = pd.DataFrame(columns_by_name)
print(df)


         MolecularWeight  LogP
Ethanol            46.07 -0.31
Methane            16.04  1.09
Benzene            78.11  2.13
Acetone            58.08 -0.24


In [58]:
# Plain rows carry no column names, so they are supplied separately.
column_names = ["Molecule", "MolecularWeight", "LogP"]
data = [
    ["Ethanol", 46.07, -0.31],
    ["Methane", 16.04, 1.09],
    ["Benzene", 78.11, 2.13],
    ["Acetone", 58.08, -0.24],
]

df = pd.DataFrame(data=data, columns=column_names)
print(df)

  Molecule  MolecularWeight  LogP
0  Ethanol            46.07 -0.31
1  Methane            16.04  1.09
2  Benzene            78.11  2.13
3  Acetone            58.08 -0.24


In [63]:
import numpy as np

# Numeric matrix: one row per molecule, columns are (MolecularWeight, LogP).
arr = np.array(
    [
        [46.07, -0.31],
        [16.04, 1.09],
        [78.11, 2.13],
        [58.08, -0.24],
    ]
)

# The array has no labels of its own, so rows and columns are named here.
df = pd.DataFrame(
    data=arr,
    index=["Ethanol", "Methane", "Benzene", "Acetone"],
    columns=["MolecularWeight", "LogP"],
)
print(df)


         MolecularWeight  LogP
Ethanol            46.07 -0.31
Methane            16.04  1.09
Benzene            78.11  2.13
Acetone            58.08 -0.24


In [61]:
# Save
# Write the file before reading it so this cell also works on a fresh kernel
# and checkout (the load below otherwise depends on a file left by an earlier
# session). If the molecule names are stored in the index rather than in a
# "Molecule" column, move them into a column first, because index=False would
# silently drop them.
molecules_out = (
    df if "Molecule" in df.columns else df.rename_axis("Molecule").reset_index()
)
molecules_out.to_csv("molecules.csv", index=False)

# Load
df2 = pd.read_csv("molecules.csv")
print(df2.head())


  Molecule  MolecularWeight  LogP
0  Ethanol            46.07 -0.31
1  Methane            16.04  1.09
2  Benzene            78.11  2.13
3  Acetone            58.08 -0.24


In [62]:
from rdkit import Chem
from rdkit.Chem import Descriptors

smiles = ["CCO", "C", "c1ccccc1", "CC(=O)C"]
names = ["Ethanol", "Methane", "Benzene", "Acetone"]


def molwt_from_smiles(smiles_string):
    """Build an RDKit molecule from a SMILES string and return its MolWt descriptor."""
    return Descriptors.MolWt(Chem.MolFromSmiles(smiles_string))


# Start from names + SMILES, then derive the weight column from the SMILES column.
df = pd.DataFrame({"Molecule": names, "SMILES": smiles})
df["MolWt"] = df["SMILES"].apply(molwt_from_smiles)
print(df)


  Molecule    SMILES   MolWt
0  Ethanol       CCO  46.069
1  Methane         C  16.043
2  Benzene  c1ccccc1  78.114
3  Acetone   CC(=O)C  58.080


In [64]:
# DataFrame очистка данных

In [86]:
import pandas as pd

# Deliberately messy input: each column has one missing value (None), and
# MolecularWeight additionally contains the non-numeric placeholder "??".
data = dict(
    Molecule=["Ethanol", "Methane", "Benzene", "Acetone", None, "Hexane"],
    MolecularWeight=[46.07, 16.04, None, 58.08, 180.15, "??"],
    LogP=[-0.31, 1.09, 2.13, None, 4.50, 3.57],
)

df = pd.DataFrame(data=data)
print(df)


  Molecule MolecularWeight  LogP
0  Ethanol           46.07 -0.31
1  Methane           16.04  1.09
2  Benzene            None  2.13
3  Acetone           58.08   NaN
4     None          180.15  4.50
5   Hexane              ??  3.57


In [87]:
# Column dtypes: MolecularWeight is `object` because it mixes numbers, None and the "??" string.
df.dtypes

Molecule            object
MolecularWeight     object
LogP               float64
dtype: object

In [88]:
# Summary of the frame: per-column non-null counts and dtypes, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Molecule         5 non-null      object 
 1   MolecularWeight  5 non-null      object 
 2   LogP             5 non-null      float64
dtypes: float64(1), object(2)
memory usage: 272.0+ bytes


In [89]:
# Missing values per column; the "??" string is not counted as missing.
df.isnull().sum()

Molecule           1
MolecularWeight    1
LogP               1
dtype: int64

In [90]:
# Drop every row that has a missing value (None/NaN) in any column.
# The "??" placeholder is an ordinary string, so row 5 survives this step.
df = df.dropna()
df

Unnamed: 0,Molecule,MolecularWeight,LogP
0,Ethanol,46.07,-0.31
1,Methane,16.04,1.09
5,Hexane,??,3.57


In [91]:
# Convert the column to numbers; errors="coerce" turns unparsable entries
# (the "??" placeholder) into NaN instead of raising.
numeric_weight = pd.to_numeric(df["MolecularWeight"], errors="coerce")
df = df.assign(MolecularWeight=numeric_weight)
df

Unnamed: 0,Molecule,MolecularWeight,LogP
0,Ethanol,46.07,-0.31
1,Methane,16.04,1.09
5,Hexane,,3.57


In [92]:
# Impute remaining gaps with each column's own mean (means skip NaN).
fill_values = {
    "MolecularWeight": df["MolecularWeight"].mean(),
    "LogP": df["LogP"].mean(),
}
df = df.fillna(value=fill_values)
df

Unnamed: 0,Molecule,MolecularWeight,LogP
0,Ethanol,46.07,-0.31
1,Methane,16.04,1.09
5,Hexane,31.055,3.57


# Group by

In [146]:
# Fresh, clean frame for the group-by examples.
data = dict(
    Molecule=["Ethanol", "Methane", "Benzene", "Acetone", "Lauric acid"],
    MolecularWeight=[46.07, 16.04, 78.11, 58.08, 200.32],
    LogP=[-0.31, 1.09, 2.13, -0.24, 4.60],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,Molecule,MolecularWeight,LogP
0,Ethanol,46.07,-0.31
1,Methane,16.04,1.09
2,Benzene,78.11,2.13
3,Acetone,58.08,-0.24
4,Lauric acid,200.32,4.6


In [147]:
# Attach a chemical-class label to each molecule, in row order.
molecule_classes = ["alcohol", "alkane", "aromatic", "ketone", "lipid"]
df = df.assign(Class=molecule_classes)
df

Unnamed: 0,Molecule,MolecularWeight,LogP,Class
0,Ethanol,46.07,-0.31,alcohol
1,Methane,16.04,1.09,alkane
2,Benzene,78.11,2.13,aromatic
3,Acetone,58.08,-0.24,ketone
4,Lauric acid,200.32,4.6,lipid


In [151]:
# Per-class aggregation: a list of functions yields one sub-column per
# function, so the result has two-level column labels.
aggregations = {
    "MolecularWeight": ["mean", "max"],
    "LogP": "mean",
}
by_class = df.groupby("Class")
grouped = by_class.agg(aggregations)
print(grouped)


         MolecularWeight          LogP
                    mean     max  mean
Class                                 
alcohol            46.07   46.07 -0.31
alkane             16.04   16.04  1.09
aromatic           78.11   78.11  2.13
ketone             58.08   58.08 -0.24
lipid             200.32  200.32  4.60


In [152]:
# Show the source frame again before the transform example.
df

Unnamed: 0,Molecule,MolecularWeight,LogP,Class
0,Ethanol,46.07,-0.31,alcohol
1,Methane,16.04,1.09,alkane
2,Benzene,78.11,2.13,aromatic
3,Acetone,58.08,-0.24,ketone
4,Lauric acid,200.32,4.6,lipid


In [154]:
# Example: within each class, normalize LogP by its mean.
# transform("mean") returns the class mean aligned to the original rows, so the
# division is a plain element-wise operation. Every class here has a single
# member, hence each ratio is 1.0.
class_mean_logp = df.groupby("Class")["LogP"].transform("mean")
df["LogP_class_norm"] = df["LogP"] / class_mean_logp
df

Unnamed: 0,Molecule,MolecularWeight,LogP,Class,LogP_class_norm
0,Ethanol,46.07,-0.31,alcohol,1.0
1,Methane,16.04,1.09,alkane,1.0
2,Benzene,78.11,2.13,aromatic,1.0
3,Acetone,58.08,-0.24,ketone,1.0
4,Lauric acid,200.32,4.6,lipid,1.0
